llama_cpp 0.3.7 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +1 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +3 -3
- data/ext/llama_cpp/llama_cpp.cpp +118 -117
- data/ext/llama_cpp/src/ggml-alloc.c +97 -53
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1010 -497
- data/ext/llama_cpp/src/ggml-cuda.h +32 -23
- data/ext/llama_cpp/src/ggml-metal.h +9 -3
- data/ext/llama_cpp/src/ggml-metal.m +142 -161
- data/ext/llama_cpp/src/ggml-metal.metal +577 -500
- data/ext/llama_cpp/src/ggml.c +2064 -233
- data/ext/llama_cpp/src/ggml.h +238 -13
- data/ext/llama_cpp/src/k_quants.c +110 -54
- data/ext/llama_cpp/src/llama-util.h +10 -8
- data/ext/llama_cpp/src/llama.cpp +4544 -2890
- data/ext/llama_cpp/src/llama.h +133 -123
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +8 -8
- metadata +2 -2
 
data/ext/llama_cpp/src/ggml-cuda.cu

@@ -6,15 +6,116 @@
 #include <atomic>
 #include <assert.h>
 
+#if defined(GGML_USE_HIPBLAS)
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#ifdef __HIP_PLATFORM_AMD__
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+#endif
+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N HIPBLAS_OP_N
+#define CUBLAS_OP_T HIPBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH 0
+#define CUDA_R_16F  HIPBLAS_R_16F
+#define CUDA_R_32F  HIPBLAS_R_32F
+#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasCreate hipblasCreate
+#define cublasGemmEx hipblasGemmEx
+#define cublasHandle_t hipblasHandle_t
+#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+#define cublasSetStream hipblasSetStream
+#define cublasSgemm hipblasSgemm
+#define cublasStatus_t hipblasStatus_t
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaError_t hipError_t
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDisableTiming hipEventDisableTiming
+#define cudaEventRecord hipEventRecord
+#define cudaEvent_t hipEvent_t
+#define cudaEventDestroy hipEventDestroy
+#define cudaFree hipFree
+#define cudaFreeHost hipHostFree
+#define cudaGetDevice hipGetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#define cudaMalloc hipMalloc
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyKind hipMemcpyKind
+#define cudaMemset hipMemset
+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+#define cudaSetDevice hipSetDevice
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
+#else
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
+#endif
 
 #include "ggml-cuda.h"
 #include "ggml.h"
 
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#ifndef CC_TURING
 #define CC_TURING   700
+#endif
+
+#if defined(GGML_USE_HIPBLAS)
+#define __CUDA_ARCH__ 1300
+
+typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+    return reinterpret_cast<const int&>(c);
+}
+
+static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+    c = __builtin_amdgcn_sdot4(a, b, c, false);
+#elif defined(__gfx1100__)
+    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+#elif defined(__gfx1010__) || defined(__gfx900__)
+    int tmp1;
+    int tmp2;
+    asm("\n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        "
+        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+        : "v"(a), "v"(b)
+    );
+#else
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+    return c;
+}
+#endif
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
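
The hunk above is the HIP/ROCm port of this file: when GGML_USE_HIPBLAS is defined, every CUDA runtime and cuBLAS symbol used later in the file is #define-aliased to its hip/hipBLAS counterpart, and __dp4a/__vsubss4 are emulated for AMD GPUs, so the same kernel source builds with either nvcc or hipcc. A minimal sketch of what that aliasing buys (illustrative host code, not taken from the diff; the buffer size and error handling are assumptions):

    #include <cstdio>

    // With the defines above in effect, this compiles unchanged for CUDA and,
    // under GGML_USE_HIPBLAS, for ROCm: cudaMalloc/cudaMemcpy/cudaFree expand
    // to hipMalloc/hipMemcpy/hipFree.
    static void copy_roundtrip(void) {
        const size_t n = 256 * sizeof(float);   // hypothetical buffer size
        float host[256] = {0};
        void * dev = nullptr;

        const cudaError_t err = cudaMalloc(&dev, n);
        if (err != cudaSuccess) {
            fprintf(stderr, "alloc failed: %s\n", cudaGetErrorString(err));
            return;
        }
        cudaMemcpy(dev, host, n, cudaMemcpyHostToDevice);
        cudaMemcpy(host, dev, n, cudaMemcpyDeviceToHost);
        cudaFree(dev);
    }
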
@@ -259,6 +360,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
+#define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
@@ -286,7 +388,7 @@ static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-static bool g_mul_mat_q = false;
+static bool g_mul_mat_q = true;
 
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
@@ -423,8 +525,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
 static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q4_1 * x = (const block_q4_1 *) vx;
 
-    const dfloat d = x[ib].dm.x;
-    const dfloat m = x[ib].dm.y;
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);
 
     const int vui = x[ib].qs[iqs];
 
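
As in the other dequantize kernels below, the direct .x/.y member reads on the packed half2 scale/min (dm) are replaced with __low2half/__high2half, which are available on both CUDA and HIP. A small sketch of the underlying arithmetic, assuming a simplified q4_1-style block (names and layout are illustrative, not the exact ggml structs):

    // One half2 packs the scale d (low half) and the min m (high half).
    // A q4_1-style value is reconstructed as x = d*q + m for a 4-bit quant q.
    __device__ float dequant_one(const half2 dm, const uint8_t packed, const bool hi_nibble) {
        const float d = __half2float(__low2half(dm));   // scale (previously read as dm.x)
        const float m = __half2float(__high2half(dm));  // min   (previously read as dm.y)
        const int   q = hi_nibble ? (packed >> 4) : (packed & 0xF);
        return d * q + m;
    }
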
@@ -466,8 +568,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
 static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q5_1 * x = (const block_q5_1 *) vx;
 
-    const dfloat d = x[ib].dm.x;
-    const dfloat m = x[ib].dm.y;
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);
 
     uint32_t qh;
     memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -519,8 +621,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const uint8_t q = x[i].qs[32*n + l];
     float * y = yy + i*QK_K + 128*n;
 
-    float dall = x[i].dm.x;
-    float dmin = x[i].dm.y;
+    float dall = __low2half(x[i].dm);
+    float dmin = __high2half(x[i].dm);
     y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -530,8 +632,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const int il = tid%16;  // 0...15
     const uint8_t q = x[i].qs[il] >> (2*is);
     float * y = yy + i*QK_K + 16*is + il;
-    float dall = x[i].dm.x;
-    float dmin = x[i].dm.y;
+    float dall = __low2half(x[i].dm);
+    float dmin = __high2half(x[i].dm);
     y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
     y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
 #endif
@@ -617,8 +719,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
 
     float * y = yy + i*QK_K + 64*il + n*ir;
 
-    const float dall = x[i].dm.x;
-    const float dmin = x[i].dm.y;
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);
 
     const uint8_t * q = x[i].qs + 32*il + n*ir;
 
@@ -656,8 +758,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
 
     float * y = yy + i*QK_K + 64*il + 2*ir;
 
-    const float dall = x[i].dm.x;
-    const float dmin = x[i].dm.y;
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);
 
     const uint8_t * ql = x[i].qs + 32*il + 2*ir;
     const uint8_t * qh = x[i].qh + 2*ir;
@@ -769,8 +871,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
         const float   * y = yy + i * QK_K + y_offset;
         const uint8_t * q = x[i].qs + q_offset;
 
-        const float dall = x[i].dm.x;
-        const float dmin = x[i].dm.y;
+        const float dall = __low2half(x[i].dm);
+        const float dmin = __high2half(x[i].dm);
 
         const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
         aux[0] = a[0] & 0x0f0f0f0f;
@@ -990,8 +1092,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
         const float   * y1 = yy + i*QK_K + y_offset;
         const float   * y2 = y1 + 128;
 
-        const float dall = x[i].dm.x;
-        const float dmin = x[i].dm.y;
+        const float dall = __low2half(x[i].dm);
+        const float dmin = __high2half(x[i].dm);
 
         const uint16_t * a = (const uint16_t *)x[i].scales;
         aux[0] = a[im+0] & kmask1;
@@ -1123,8 +1225,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
         const float   * y1  = yy + i*QK_K + y_offset;
         const float   * y2  = y1 + 128;
 
-        const float dall = x[i].dm.x;
-        const float dmin = x[i].dm.y;
+        const float dall = __low2half(x[i].dm);
+        const float dmin = __high2half(x[i].dm);
 
         const uint16_t * a = (const uint16_t *)x[i].scales;
         aux[0] = a[im+0] & kmask1;
@@ -1347,8 +1449,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
         return;
     }
 
-    y[ib].ds.x = d;
-    y[ib].ds.y = sum;
+    reinterpret_cast<half&>(y[ib].ds.x) = d;
+    reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
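
quantize_q8_1 stores the per-block scale and the block sum in the two halves of the block's ds field; the assignment now goes through reinterpret_cast<half&> so the same line compiles whether the half2 members are exposed as half or as raw storage (as on some HIP/CUDA toolchains). A sketch of the same idea, with the field roles taken from the diff and the helper name purely illustrative:

    // Write two half values into a half2 without relying on direct member assignment.
    __device__ void store_ds(half2 & ds, const float d, const float sum) {
        reinterpret_cast<half&>(ds.x) = __float2half(d);    // low half: scale
        reinterpret_cast<half&>(ds.y) = __float2half(sum);  // high half: sum of the block's quants
        // equivalent single call: ds = __floats2half2_rn(d, sum);
    }
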
@@ -1399,6 +1501,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
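
This and the following hunks add assert(false) to the non-dp4a fallback branches. The surrounding pattern is compile-time dispatch on __CUDA_ARCH__: the integer dot-product path is only emitted for devices with compute capability >= MIN_CC_DP4A, and the stub for older architectures now traps in debug builds instead of silently returning 0. A standalone sketch of that pattern (the function itself is illustrative, not from the file):

    #include <assert.h>

    #define MIN_CC_DP4A 610   // same threshold the file uses

    // Hypothetical helper: byte-wise dot product of two packed int8x4 values.
    static __device__ __forceinline__ int dot4_or_trap(const int a, const int b) {
    #if __CUDA_ARCH__ >= MIN_CC_DP4A
        return __dp4a(a, b, 0);   // hardware intrinsic, sm_61 and newer
    #else
        assert(false);            // this branch should never run on a capable device
        return 0;                 // only to satisfy the compiler
    #endif
    }
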
@@ -1436,6 +1539,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1471,6 +1575,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
     // second part effectively subtracts 16 from each quant value
     return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1516,6 +1621,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
 
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1537,6 +1643,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
 
     return d8_0*d8_1 * sumi;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1567,6 +1674,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1602,6 +1710,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
 
     return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1639,6 +1748,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
 
     return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1679,6 +1789,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
 
     return d3 * sumf;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1704,6 +1815,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
 
     return d3*d8 * sumi;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1737,12 +1849,12 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 // contiguous u/y values
-// also used for q5_K
 static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
     const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
@@ -1752,19 +1864,18 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     float sumf_m = 0.0f;
 
 #pragma unroll
-    for (int 
+    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
         int sumi_d = 0;
 
 #pragma unroll
-        for (int 
-            sumi_d = __dp4a(v[
-            sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
         }
 
-        const float2 ds8f = __half22float2(ds8[
+        const float2 ds8f = __half22float2(ds8[i]);
 
-        sumf_d += ds8f.x * (sc[
-        sumf_m += ds8f.y *   m[
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
     }
 
     const float2 dm4f = __half22float2(dm4);
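
In the rewritten q4_K MMQ loop above, the 4-bit values are no longer pre-expanded into a temporary array; each iteration unpacks one nibble plane directly with a shift and mask and feeds it to __dp4a. A small sketch of just that unpack step (assuming a dp4a-capable architecture; names are illustrative):

    // One 32-bit word holds eight 4-bit quants: the low nibble of each byte is
    // plane 0, the high nibble is plane 1, i.e. the (v >> (4*plane)) & 0x0F0F0F0F idiom.
    __device__ int dot_nibble_plane(const int v, const int u, const int plane /* 0 or 1 */) {
        const int vq = (v >> (4 * plane)) & 0x0F0F0F0F;  // four 4-bit values, one per byte
        return __dp4a(vq, u, 0);                          // byte-wise dot with four int8 values
    }
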
@@ -1772,6 +1883,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1780,7 +1892,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
 #define VDR_Q5_K_Q8_1_MMQ  8
 
 // contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
     const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
     const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
 
@@ -1812,6 +1924,41 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
     return dm5f.x*sumf_d - dm5f.y*sumf_m;
 
 #else
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
+        }
+
+        const float2 ds8f = __half22float2(ds8[i]);
+
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1842,6 +1989,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
 
     return d*sumf;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1873,6 +2021,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
     return d6 * sumf_d;
 
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -2298,7 +2447,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
         u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
     }
 
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2384,7 +2533,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR2_K; ++ i) {
         u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
     }
 
     return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@@ -2503,7 +2652,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR3_K; ++i) {
         u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
     }
 
     return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@@ -2672,7 +2821,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 
     for (int i = 0; i < QR4_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds.x;
+        d8[i] = __low2half(bq8i->ds);
 
         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
@@ -2699,8 +2848,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     const float dall = bq4_K->d[0];
     const float dmin = bq4_K->d[1];
 
-    const float d8_1 = bq8_1[0].ds.x;
-    const float d8_2 = bq8_1[1].ds.x;
+    const float d8_1 = __low2float(bq8_1[0].ds);
+    const float d8_2 = __low2float(bq8_1[1].ds);
 
     const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
     const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2722,6 +2871,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     return dall * sumf_d - dmin * sumf_m;
 
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
@@ -2808,18 +2958,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-    int v[QR4_K*VDR_Q4_K_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) {
-        v[l + 0]         = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F;
-        v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F;
-    }
-
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);

     const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
-    return vec_dot_q4_K_q8_1_impl_mmq(
+    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
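The loop removed above split each packed q4 word into low and high nibbles in the wrapper; after this change the raw x_ql pointer is handed to the impl function, which performs the unpacking itself. For reference, a tiny sketch of that nibble split on one 32-bit word (the helper name split_nibbles is illustrative):

    // Splits four packed 4-bit quants (one per byte) into low and high nibbles.
    __device__ void split_nibbles(const int q, int & lo, int & hi) {
        lo = (q >> 0) & 0x0F0F0F0F;  // low  nibble of each byte
        hi = (q >> 4) & 0x0F0F0F0F;  // high nibble of each byte
    }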
@@ -2859,14 +3002,14 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR5_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds
+        d8[i] = __low2float(bq8i->ds);

         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
         u[2*i+1] = q8[4];
     }

-    return
+    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);

 #else

@@ -2877,8 +3020,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(

     const float d = bq5_K->d;

-    const float d8_1 = bq8_1[0].ds
-    const float d8_2 = bq8_1[1].ds
+    const float d8_1 = __low2half(bq8_1[0].ds);
+    const float d8_2 = __low2half(bq8_1[1].ds);

     const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
     const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2905,6 +3048,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     return d * sumf_d;

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

@@ -3008,7 +3152,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(

     const int index_x = i * (QR5_K*WARP_SIZE + 1) +  QR5_K*k;
     const int index_y = j * WARP_SIZE             + (QR5_K*k) % WARP_SIZE;
-    return
+    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
@@ -3031,7 +3176,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR6_K; ++i) {
         u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + 2*i].ds
+        d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
     }

     return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@@ -3135,7 +3280,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(

 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
           allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
-static __global__ void mul_mat_q(
+static __device__ __forceinline__ void mul_mat_q(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

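With this hunk mul_mat_q stops being a __global__ kernel and becomes a __device__ __forceinline__ worker; the per-type kernels added further down (mul_mat_q4_0, mul_mat_q4_1, ..., mul_mat_q6_K) wrap it and pick tile sizes per architecture. A minimal sketch of that wrapper pattern, with hypothetical names (worker, kernel_q) and illustrative tile values:

    #include <assert.h>

    // Illustrative stand-ins; the real thresholds are defined earlier in ggml-cuda.cu.
    #define CC_TURING   700
    #define MIN_CC_DP4A 610

    // Templated device-side worker: tile loads and dp4a dot products would go here.
    template <int mmq_x, int mmq_y, int nwarps>
    static __device__ __forceinline__ void worker(float * dst, const int n) {
    }

    // Thin __global__ wrapper that selects tile sizes by compute capability.
    template <bool need_check> static __global__ void kernel_q(float * dst, const int n) {
    #if __CUDA_ARCH__ >= CC_TURING
        worker<64, 128, 4>(dst, n);   // wider tiles, fewer warps on Turing and newer
    #elif __CUDA_ARCH__ >= MIN_CC_DP4A
        worker<64, 64, 8>(dst, n);    // smaller tiles, more warps on Pascal
    #else
        assert(false);                // dp4a support is required
    #endif
    }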
@@ -3150,7 +3295,6 @@ static __global__ void mul_mat_q(

     const int row_dst_0 = blockIdx.x*mmq_y;
     const int & row_x_0 = row_dst_0;
-    const int row_dst = row_dst_0 + threadIdx.x;

     const int col_dst_0 = blockIdx.y*mmq_x;
     const int & col_y_0 = col_dst_0;
@@ -3200,7 +3344,7 @@ static __global__ void mul_mat_q(
                     *dsi_dst = *dsi_src;
                 } else {
                     float * dfi_dst = (float *) dsi_dst;
-                    *dfi_dst = (*dsi_src)
+                    *dfi_dst = __low2half(*dsi_src);
                 }
             }

@@ -3223,11 +3367,7 @@ static __global__ void mul_mat_q(
         }
     }

-
-    if (row_dst >= nrows_dst) {
-        return;
-    }
-
+#pragma unroll
     for (int j = 0; j < mmq_x; j += nwarps) {
         const int col_dst = col_dst_0 + j + threadIdx.y;

@@ -3235,12 +3375,375 @@ static __global__ void mul_mat_q(
             return;
         }

+#pragma unroll
         for (int i = 0; i < mmq_y; i += WARP_SIZE) {
-
+            const int row_dst = row_dst_0 + threadIdx.x + i;
+
+            if (row_dst >= nrows_dst) {
+                continue;
+            }
+
+            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
         }
     }
 }

+#define  MMQ_X_Q4_0_AMPERE 64
+#define  MMQ_Y_Q4_0_AMPERE 128
+#define NWARPS_Q4_0_AMPERE 4
+#define  MMQ_X_Q4_0_PASCAL 64
+#define  MMQ_Y_Q4_0_PASCAL 64
+#define NWARPS_Q4_0_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q4_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q4_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_0_AMPERE;
+    const int nwarps = NWARPS_Q4_0_AMPERE;
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q4_0_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q4_0_PASCAL;
+    const int nwarps = NWARPS_Q4_0_PASCAL;
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_0_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define  MMQ_X_Q4_1_AMPERE 64
+#define  MMQ_Y_Q4_1_AMPERE 128
+#define NWARPS_Q4_1_AMPERE 4
+#define  MMQ_X_Q4_1_PASCAL 64
+#define  MMQ_Y_Q4_1_PASCAL 64
+#define NWARPS_Q4_1_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q4_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q4_1_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_1_AMPERE;
+    const int nwarps = NWARPS_Q4_1_AMPERE;
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q4_1_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q4_1_PASCAL;
+    const int nwarps = NWARPS_Q4_1_PASCAL;
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_1_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define  MMQ_X_Q5_0_AMPERE 128
+#define  MMQ_Y_Q5_0_AMPERE 64
+#define NWARPS_Q5_0_AMPERE 4
+#define  MMQ_X_Q5_0_PASCAL 64
+#define  MMQ_Y_Q5_0_PASCAL 64
+#define NWARPS_Q5_0_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q5_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q5_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_0_AMPERE;
+    const int nwarps = NWARPS_Q5_0_AMPERE;
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q5_0_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q5_0_PASCAL;
+    const int nwarps = NWARPS_Q5_0_PASCAL;
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_0_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define  MMQ_X_Q5_1_AMPERE 128
+#define  MMQ_Y_Q5_1_AMPERE 64
+#define NWARPS_Q5_1_AMPERE 4
+#define  MMQ_X_Q5_1_PASCAL 64
+#define  MMQ_Y_Q5_1_PASCAL 64
+#define NWARPS_Q5_1_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q5_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q5_1_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_1_AMPERE;
+    const int nwarps = NWARPS_Q5_1_AMPERE;
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q5_1_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q5_1_PASCAL;
+    const int nwarps = NWARPS_Q5_1_PASCAL;
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_1_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define  MMQ_X_Q8_0_AMPERE 128
+#define  MMQ_Y_Q8_0_AMPERE 64
+#define NWARPS_Q8_0_AMPERE 4
+#define  MMQ_X_Q8_0_PASCAL 64
+#define  MMQ_Y_Q8_0_PASCAL 64
+#define NWARPS_Q8_0_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q8_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q8_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q8_0_AMPERE;
+    const int nwarps = NWARPS_Q8_0_AMPERE;
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q8_0_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q8_0_PASCAL;
+    const int nwarps = NWARPS_Q8_0_PASCAL;
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q8_0_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define  MMQ_X_Q2_K_AMPERE 64
+#define  MMQ_Y_Q2_K_AMPERE 128
+#define NWARPS_Q2_K_AMPERE 4
+#define  MMQ_X_Q2_K_PASCAL 64
+#define  MMQ_Y_Q2_K_PASCAL 64
+#define NWARPS_Q2_K_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q2_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q2_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q2_K_AMPERE;
+    const int nwarps = NWARPS_Q2_K_AMPERE;
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q2_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q2_K_PASCAL;
+    const int nwarps = NWARPS_Q2_K_PASCAL;
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q2_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define  MMQ_X_Q3_K_AMPERE 128
+#define  MMQ_Y_Q3_K_AMPERE 128
+#define NWARPS_Q3_K_AMPERE 4
+#define  MMQ_X_Q3_K_PASCAL 64
+#define  MMQ_Y_Q3_K_PASCAL 64
+#define NWARPS_Q3_K_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q3_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q3_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q3_K_AMPERE;
+    const int nwarps = NWARPS_Q3_K_AMPERE;
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q3_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q3_K_PASCAL;
+    const int nwarps = NWARPS_Q3_K_PASCAL;
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q3_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define  MMQ_X_Q4_K_AMPERE 64
+#define  MMQ_Y_Q4_K_AMPERE 128
+#define NWARPS_Q4_K_AMPERE 4
+#define  MMQ_X_Q4_K_PASCAL 64
+#define  MMQ_Y_Q4_K_PASCAL 64
+#define NWARPS_Q4_K_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q4_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q4_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_K_AMPERE;
+    const int nwarps = NWARPS_Q4_K_AMPERE;
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q4_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q4_K_PASCAL;
+    const int nwarps = NWARPS_Q4_K_PASCAL;
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define  MMQ_X_Q5_K_AMPERE 64
+#define  MMQ_Y_Q5_K_AMPERE 128
+#define NWARPS_Q5_K_AMPERE 4
+#define  MMQ_X_Q5_K_PASCAL 64
+#define  MMQ_Y_Q5_K_PASCAL 64
+#define NWARPS_Q5_K_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q5_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q5_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_K_AMPERE;
+    const int nwarps = NWARPS_Q5_K_AMPERE;
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q5_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q5_K_PASCAL;
+    const int nwarps = NWARPS_Q5_K_PASCAL;
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define  MMQ_X_Q6_K_AMPERE 64
+#define  MMQ_Y_Q6_K_AMPERE 64
+#define NWARPS_Q6_K_AMPERE 4
+#define  MMQ_X_Q6_K_PASCAL 64
+#define  MMQ_Y_Q6_K_PASCAL 64
+#define NWARPS_Q6_K_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q6_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q6_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q6_K_AMPERE;
+    const int nwarps = NWARPS_Q6_K_AMPERE;
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q6_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q6_K_PASCAL;
+    const int nwarps = NWARPS_Q6_K_PASCAL;
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q6_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -3485,13 +3988,13 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
 // rope == RoPE == rotary positional embedding
 static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
                                 const float p_delta, const int p_delta_rows, const float theta_scale) {
-    const int col = 2*(blockDim.
+    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

     if (col >= ncols) {
         return;
     }

-    const int row = blockDim.
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
     const int i = row*ncols + col;

     const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
@@ -3505,6 +4008,28 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }

+static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
+                                const float p_delta, const int p_delta_rows, const float theta_scale) {
+    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+    const int i = row*ncols + col/2;
+
+    const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
+    const float sin_theta = sinf(theta);
+    const float cos_theta = cosf(theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + ncols/2];
+
+    dst[i + 0]       = x0*cos_theta - x1*sin_theta;
+    dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+}
+
 static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;
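The new rope_neox_f32 kernel applies the same rotation as rope_f32 but pairs element j of a row with element j + ncols/2 (the GPT-NeoX RoPE layout) instead of with its immediate neighbour. A minimal host-side sketch of the per-row math, for illustration only (rope_neox_row_ref is not part of the diff; theta_base stands in for p0 + p_delta*(row/p_delta_rows)):

    #include <cmath>

    // Reference rotation for one row of length ncols (assumed even).
    static void rope_neox_row_ref(const float * x, float * dst, int ncols,
                                  float theta_base, float theta_scale) {
        float theta = theta_base;
        for (int j = 0; j < ncols/2; ++j) {
            const float cos_t = cosf(theta);
            const float sin_t = sinf(theta);
            const float x0 = x[j];
            const float x1 = x[j + ncols/2];
            dst[j]           = x0*cos_t - x1*sin_t;
            dst[j + ncols/2] = x0*sin_t + x1*cos_t;
            theta *= theta_scale; // same geometric progression as powf(theta_scale, col/2)
        }
    }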
@@ -3539,9 +4064,32 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
 }

-static __global__ void
+static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
+                                 const int n_heads_log2_floor, const float m0, const float m1) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (col >= ncols) {
+        return;
+    }
+
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i = row*ncols + col;
+
+    const int k = row/k_rows;
+
+    float m_k;
+    if (k < n_heads_log2_floor) {
+        m_k = powf(m0, k + 1);
+    } else {
+        m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+    }
+
+    dst[i] = col * m_k + x[i];
+}
+
+static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+    const int col = blockDim.y*blockIdx.y + threadIdx.y;
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;

     if (col >= ncols) {
         return;
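For context, the slope that alibi_f32 multiplies into the column index follows the usual ALiBi head schedule visible in the kernel above. A small host-side sketch (alibi_slope_ref is an illustrative name, not part of the diff):

    #include <cmath>

    // Per-head ALiBi slope: the first n_heads_log2_floor heads take successive
    // powers of m0, the remaining heads take odd powers of m1.
    static float alibi_slope_ref(int head, int n_heads_log2_floor, float m0, float m1) {
        if (head < n_heads_log2_floor) {
            return powf(m0, head + 1);
        }
        return powf(m1, 2*(head - n_heads_log2_floor) + 1);
    }
    // The kernel then adds col * slope to x[i], i.e. a bias that grows linearly
    // with the key position for each attention head.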
@@ -3554,24 +4102,29 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int

 // the CUDA soft max implementation differs from the CPU implementation
 // instead of doubles floats are used
-// values are also not normalized to the maximum value by subtracting it in the exponential function
-// theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
 static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockDim.
-    const int block_size = blockDim.
-    const int tid = threadIdx.
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+    const int block_size = blockDim.y;
+    const int tid = threadIdx.y;

-    float
+    float max_val = -INFINITY;

-    for (int
-        const int
+    for (int col = tid; col < ncols; col += block_size) {
+        const int i = row*ncols + col;
+        max_val = max(max_val, x[i]);
+    }

-
-
-
+    // find the max value in the block
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+    }

+    float tmp = 0.f;
+
+    for (int col = tid; col < ncols; col += block_size) {
         const int i = row*ncols + col;
-        const float val = expf(x[i]);
+        const float val = expf(x[i] - max_val);
         tmp += val;
         dst[i] = val;
     }
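The __shfl_xor_sync loop added above is a standard warp-level butterfly reduction. Factored into a helper it would look roughly like the sketch below (warp_reduce_max is a hypothetical name; the diff keeps the loop inline):

    // After the 5 XOR-shuffle steps with masks 16, 8, 4, 2, 1 every lane of the
    // 32-lane warp holds the maximum over all lanes, with no shared-memory trip.
    static __device__ __forceinline__ float warp_reduce_max(float v) {
    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            v = fmaxf(v, __shfl_xor_sync(0xffffffff, v, mask, 32));
        }
        return v;
    }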
@@ -3582,15 +4135,11 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
     }

-
-        const int col = block_start + tid;
-
-        if (col >= ncols) {
-            break;
-        }
+    const float inv_tmp = 1.f / tmp;

+    for (int col = tid; col < ncols; col += block_size) {
         const int i = row*ncols + col;
-        dst[i]
+        dst[i] *= inv_tmp;
     }
 }

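Taken together, the two soft_max_f32 hunks switch the kernel to the numerically stable formulation: the row maximum is subtracted before exponentiation and the sum is applied as a reciprocal multiply. A plain C++ reference of the per-row result, for comparison only (soft_max_row_ref is not part of the diff):

    #include <algorithm>
    #include <cmath>

    static void soft_max_row_ref(const float * x, float * dst, int ncols) {
        float max_val = -INFINITY;
        for (int i = 0; i < ncols; ++i) {
            max_val = std::max(max_val, x[i]);   // subtracting this keeps expf in range
        }
        float sum = 0.f;
        for (int i = 0; i < ncols; ++i) {
            dst[i] = expf(x[i] - max_val);
            sum   += dst[i];
        }
        const float inv_sum = 1.f/sum;           // one division, then scale each element
        for (int i = 0; i < ncols; ++i) {
            dst[i] *= inv_sum;
        }
    }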
@@ -3942,48 +4491,32 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q4_0_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_0_AMPERE;
+        nwarps = NWARPS_Q4_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q4_0_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_0_PASCAL;
+        nwarps = NWARPS_Q4_0_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

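All of the reworked ggml_mul_mat_*_q8_1_cuda launchers below follow the same pattern as the q4_0 one above: pick the tile shape (mmq_x, mmq_y) and warp count for the detected architecture, then size the grid by ceiling division of the matrix extents. A small sketch of that grid computation with made-up sizes (the ceil_div helper and the example numbers are illustrative, not from the diff):

    // Grid sizing used by the launchers: one block per (mmq_y x mmq_x) output tile.
    static inline int ceil_div(int n, int tile) {
        return (n + tile - 1) / tile;
    }

    // e.g. nrows_x = 4096, ncols_y = 512 with a 128-row / 64-column tile would
    // launch a 32 x 8 grid; need_check is true whenever nrows_x is not a
    // multiple of mmq_y, so the last row of tiles guards out-of-bounds loads.
    // int block_num_x = ceil_div(nrows_x, mmq_y);   // rows of x
    // int block_num_y = ceil_div(ncols_y, mmq_x);   // columns of y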
@@ -3995,49 +4528,32 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q4_1_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_1_AMPERE;
+        nwarps = NWARPS_Q4_1_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q4_1_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_1_PASCAL;
+        nwarps = NWARPS_Q4_1_PASCAL;
     } else {
-
-
-        const int nwarps = 8;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }

+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -4049,48 +4565,32 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q5_0_AMPERE;
+        mmq_y  =  MMQ_Y_Q5_0_AMPERE;
+        nwarps = NWARPS_Q5_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q5_0_PASCAL;
+        mmq_y  =  MMQ_Y_Q5_0_PASCAL;
+        nwarps = NWARPS_Q5_0_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -4102,48 +4602,32 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q5_1_AMPERE;
+        mmq_y  =  MMQ_Y_Q5_1_AMPERE;
+        nwarps = NWARPS_Q5_1_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q5_1_PASCAL;
+        mmq_y  =  MMQ_Y_Q5_1_PASCAL;
+        nwarps = NWARPS_Q5_1_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -4155,48 +4639,32 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q8_0_AMPERE;
+        mmq_y  =  MMQ_Y_Q8_0_AMPERE;
+        nwarps = NWARPS_Q8_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q8_0_PASCAL;
+        mmq_y  =  MMQ_Y_Q8_0_PASCAL;
+        nwarps = NWARPS_Q8_0_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
     | 
    
         
             
            }
         
     | 
| 
       4202 
4670 
     | 
    
         | 
| 
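Each mul_mat launcher in this range now follows the same pattern: the tile sizes mmq_x/mmq_y and the warp count nwarps come from per-architecture constants (the *_AMPERE values when compute_capability >= CC_TURING, the *_PASCAL values for anything supporting DP4A), the launch grid is derived by ceiling division over those tiles, and need_check selects a bounds-checked kernel variant only when the row count is not a multiple of the tile height. The snippet below only illustrates that launch-geometry arithmetic with placeholder tile sizes; it is not part of the diff.

// Illustration only: the launch-geometry arithmetic shared by the launchers above.
// The tile sizes are placeholders, not the real MMQ_X_*/MMQ_Y_*/NWARPS_* constants.
#include <cstdio>

static int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main() {
    const int nrows_x = 4096, ncols_y = 512; // example matrix extents
    const int mmq_x = 64, mmq_y = 128;       // placeholder tile sizes

    const int block_num_x = ceil_div(nrows_x, mmq_y); // blocks covering the rows of x
    const int block_num_y = ceil_div(ncols_y, mmq_x); // blocks covering the cols of y

    // The bounds-checked kernel variant is only needed when the last row tile is partial.
    const bool need_check = (nrows_x % mmq_y) != 0;

    printf("grid = %d x %d, need_check = %d\n", block_num_x, block_num_y, (int) need_check);
    return 0;
}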
@@ -4208,48 +4676,32 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q2_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q2_K_AMPERE;
+        nwarps = NWARPS_Q2_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q2_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q2_K_PASCAL;
+        nwarps = NWARPS_Q2_K_PASCAL;
     } else {
-            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -4261,48 +4713,32 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q3_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q3_K_AMPERE;
+        nwarps = NWARPS_Q3_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q3_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q3_K_PASCAL;
+        nwarps = NWARPS_Q3_K_PASCAL;
     } else {
-            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -4314,48 +4750,32 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q4_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_K_AMPERE;
+        nwarps = NWARPS_Q4_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q4_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_K_PASCAL;
+        nwarps = NWARPS_Q4_K_PASCAL;
     } else {
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -4367,48 +4787,32 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q5_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q5_K_AMPERE;
+        nwarps = NWARPS_Q5_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q5_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q5_K_PASCAL;
+        nwarps = NWARPS_Q5_K_PASCAL;
     } else {
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -4420,48 +4824,32 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q6_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q6_K_AMPERE;
+        nwarps = NWARPS_Q6_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q6_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q6_K_PASCAL;
+        nwarps = NWARPS_Q6_K_PASCAL;
     } else {
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
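All of the q2_K through q6_K launchers above dispatch on a compile-time need_check flag, so the common case (row count divisible by the tile height) runs without per-row bounds checks. Below is a minimal sketch of that template-bool pattern, using a hypothetical copy kernel rather than the actual mul_mat_q code.

// Hypothetical sketch of the need_check template-bool pattern used by the launchers above.
#include <cuda_runtime.h>

template <bool need_check>
static __global__ void copy_rows(const float * src, float * dst, const int nrows, const int ncols) {
    const int row = blockIdx.x * blockDim.y + threadIdx.y;
    if (need_check && row >= nrows) {
        return; // this branch is compiled out entirely when need_check == false
    }
    for (int col = threadIdx.x; col < ncols; col += blockDim.x) {
        dst[row * ncols + col] = src[row * ncols + col];
    }
}

static void copy_rows_cuda(const float * src, float * dst, const int nrows, const int ncols,
                           const int rows_per_block, cudaStream_t stream) {
    const dim3 block_dims(32, rows_per_block, 1);
    const dim3 block_nums((nrows + rows_per_block - 1) / rows_per_block, 1, 1);
    if (nrows % rows_per_block == 0) {
        copy_rows<false><<<block_nums, block_dims, 0, stream>>>(src, dst, nrows, ncols);
    } else {
        copy_rows<true><<<block_nums, block_dims, 0, stream>>>(src, dst, nrows, ncols);
    }
}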
@@ -4511,13 +4899,21 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
 
 static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                           const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-    GGML_ASSERT(nrows % 2 == 0);
-    const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1
+    GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
+    const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
-    const dim3 block_nums(
+    const dim3 block_nums(nrows, num_blocks_x, 1);
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }
 
+static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+                          const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+    const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nrows, num_blocks_x, 1);
+    rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
+}
+
 static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(nrows % 4 == 0);
     const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
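Note the layout change in the RoPE launchers above: thread blocks are now (1, 2*CUDA_ROPE_BLOCK_SIZE, 1) and the grid is (nrows, num_blocks_x, 1), so the row index rides on blockIdx.x while columns are tiled along the y dimension. The following is only a hedged index-recovery sketch under that layout; it is not the actual rope_f32 body, which is defined elsewhere in the file.

// Index recovery for the swapped launch geometry used by the updated RoPE launchers:
// gridDim = (nrows, ceil(ncols / blockDim.y), 1), blockDim = (1, 2*CUDA_ROPE_BLOCK_SIZE, 1).
static __global__ void rope_index_demo(const float * x, float * dst, const int ncols) {
    const int col = blockDim.y * blockIdx.y + threadIdx.y; // columns advance along y
    if (col >= ncols) {
        return; // partial tile at the right edge
    }
    const int row = blockIdx.x;                        // one row per block along x
    dst[row * ncols + col] = x[row * ncols + col];     // placeholder copy instead of the rotation
}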
@@ -4526,16 +4922,25 @@ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, con
     rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
 }
 
+static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
+                           const int k_rows, const int n_heads_log2_floor, const float m0,
+                           const float m1, cudaStream_t stream) {
+    const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
+    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
+    const dim3 block_nums(num_blocks_x, nrows, 1);
+    alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
+}
+
 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
-    const dim3 block_dims(
+    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
     const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
-    const dim3 block_nums(
+    const dim3 block_nums(nrows_x, block_num_x, 1);
     diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
 }
 
 static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
-    const dim3 block_dims(
-    const dim3 block_nums(
+    const dim3 block_dims(1, WARP_SIZE, 1);
+    const dim3 block_nums(nrows_x, 1, 1);
     soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
 }
 
@@ -4640,10 +5045,18 @@ void ggml_init_cublas() {
     static bool initialized = false;
 
     if (!initialized) {
+
+#ifdef __HIP_PLATFORM_AMD__
+        // Workaround for a rocBLAS bug when using multiple graphics cards:
+        // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+        rocblas_initialize();
+        CUDA_CHECK(cudaDeviceSynchronize());
+#endif
+
         CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
-        fprintf(stderr, "%s: found %d 
+        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
         for (int id = 0; id < g_device_count; ++id) {
             cudaDeviceProp prop;
             CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
@@ -5241,7 +5654,8 @@ inline void ggml_cuda_op_rope(
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
-    const bool 
+    const bool is_neox = mode & 2;
+    const bool is_glm  = mode & 4;
 
     // compute
     if (is_glm) {
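ggml_cuda_op_rope now decodes two independent flag bits from mode: bit 1 selects the NeoX-style rotation and bit 2 the GLM variant, while bit 0 continues to control whether n_past is folded into p0 in the dispatch below. A small, purely illustrative check of that decoding (the loop and printout are only for demonstration):

// Demonstration of the RoPE mode flag bits checked above.
#include <cstdio>

int main() {
    for (int mode = 0; mode < 8; ++mode) {
        const bool is_neox = mode & 2; // bit 1: NeoX-style RoPE
        const bool is_glm  = mode & 4; // bit 2: GLM-style RoPE
        printf("mode=%d -> is_neox=%d is_glm=%d\n", mode, (int) is_neox, (int) is_glm);
    }
    return 0;
}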
@@ -5249,6 +5663,10 @@ inline void ggml_cuda_op_rope(
         const float id_p = min(p, n_ctx - 2.f);
         const float block_p = max(p - (n_ctx - 2.f), 0.f);
         rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+    } else if (is_neox) {
+        GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
+        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
+        rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
     } else {
         const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
         rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
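Note on the dispatch in the hunk above: the RoPE variant is selected from bit flags in `mode`, with the GLM flag checked before the NeoX flag, and `mode & 1` controlling whether the rotation position starts at `n_past` or at 0. A tiny illustrative sketch of that decoding (the helper name is invented for illustration and is not part of the diff):

    // Sketch (not from the diff): how the rope "mode" flags map onto the
    // branches of ggml_cuda_op_rope shown in the hunk above.
    static const char * rope_variant(int mode) {
        if (mode & 4) return "glm";   // handled by rope_glm_f32_cuda
        if (mode & 2) return "neox";  // handled by rope_neox_f32_cuda
        return "default";             // handled by rope_f32_cuda
    }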
@@ -5261,6 +5679,41 @@ inline void ggml_cuda_op_rope(
     (void) i1;
 }
 
+inline void ggml_cuda_op_alibi(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_head = ((int32_t *) dst->op_params)[1];
+    float max_bias;
+    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
+
+    GGML_ASSERT(ne01 + n_past == ne00);
+    GGML_ASSERT(n_head == ne02);
+
+    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
+
+    // compute
+    alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
+
+    (void) src1;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_diag_mask_inf(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
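A worked check of the ALiBi slope bases computed in `ggml_cuda_op_alibi` above, with illustrative values that are not from the diff: for `n_head = 8` and `max_bias = 8.0`, `n_heads_log2_floor` is 8, so `m0 = 2^(-1) = 0.5` and `m1 = 2^(-0.5) ≈ 0.707`. The per-head slopes themselves are applied inside `alibi_f32_cuda`, which is defined elsewhere in this file. A standalone sketch reproducing just that arithmetic:

    #include <math.h>
    #include <stdio.h>

    // Recomputes the ALiBi slope bases m0/m1 exactly as in ggml_cuda_op_alibi
    // above, for example values of n_head and max_bias.
    int main() {
        const int   n_head   = 8;
        const float max_bias = 8.0f;

        const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));

        const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

        // Expected output: n_heads_log2_floor = 8, m0 = 0.5, m1 ≈ 0.707107
        printf("n_heads_log2_floor = %d, m0 = %f, m1 = %f\n", n_heads_log2_floor, m0, m1);
        return 0;
    }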
@@ -5881,6 +6334,11 @@ void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ten
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
 }
 
+void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
+}
+
 void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
@@ -6000,7 +6458,7 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     return extra;
 }
 
-void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
+void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
     if (scratch && g_scratch_size == 0) {
         return;
     }
@@ -6009,14 +6467,19 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src[0]->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
-            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
+            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
         }
     }
     if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
+        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
     }
 
     tensor->backend = GGML_BACKEND_GPU;
+
+    if (scratch && no_alloc) {
+        return;
+    }
+
     struct ggml_tensor_extra_gpu * extra;
 
     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
@@ -6068,16 +6531,48 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     tensor->extra = extra;
 }
 
+void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
+    if (g_scratch_size == 0) {
+        return;
+    }
+    if (g_scratch_buffer == nullptr) {
+        CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
+    }
+
+    struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+
+    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
+        tensor->op == GGML_OP_VIEW;
+
+    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+        size_t view_offset = 0;
+        if (tensor->op == GGML_OP_VIEW) {
+            memcpy(&view_offset, tensor->op_params, sizeof(size_t));
+        }
+        extra->data_device[g_main_device] = src0_ddc + view_offset;
+    } else {
+        extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
+    }
+
+    tensor->extra = extra;
+}
+
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true, false);
+    ggml_cuda_assign_buffers_impl(tensor, true, false, false);
+}
+
+void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, true, false, true);
 }
 
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false, false);
+    ggml_cuda_assign_buffers_impl(tensor, false, false, false);
 }
 
 void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false, true);
+    ggml_cuda_assign_buffers_impl(tensor, false, true, false);
 }
 
 void ggml_cuda_set_main_device(int main_device) {
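The new entry points above split buffer assignment into a "mark" step (`ggml_cuda_assign_buffers_no_alloc`) and an explicit "offset" step (`ggml_cuda_assign_scratch_offset`). A minimal usage sketch, with hypothetical caller code: it assumes a scratch size has already been configured (e.g. via `ggml_cuda_set_scratch_size`) so `g_scratch_size` is non-zero, and uses a naive bump allocator purely for illustration rather than the offsets a real planner such as ggml-alloc would produce:

    #include <vector>
    #include "ggml.h"
    #include "ggml-cuda.h"  // assumed to declare the functions added in this diff

    // Two-pass assignment using the no_alloc API from the hunk above.
    // `nodes` is a hypothetical list of graph tensors.
    static void assign_with_precomputed_offsets(const std::vector<ggml_tensor *> & nodes) {
        for (ggml_tensor * t : nodes) {
            ggml_cuda_assign_buffers_no_alloc(t);        // marks backend GPU, defers scratch allocation
        }
        size_t offset = 0;
        for (ggml_tensor * t : nodes) {
            ggml_cuda_assign_scratch_offset(t, offset);  // points the tensor at g_scratch_buffer + offset
            offset += ggml_nbytes(t);                    // bump allocation; ignores alignment and lifetimes
        }
    }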
@@ -6216,6 +6711,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_rope;
             break;
+        case GGML_OP_ALIBI:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_alibi;
+            break;
         default:
             return false;
     }
@@ -6229,3 +6730,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     func(tensor->src[0], tensor->src[1], tensor);
     return true;
 }
+
+int ggml_cuda_get_device_count() {
+    int device_count;
+    CUDA_CHECK(cudaGetDeviceCount(&device_count));
+    return device_count;
+}
+
+void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+    snprintf(description, description_size, "%s", prop.name);
+}
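The final hunk adds two small device-introspection helpers. A short sketch of how a caller might enumerate devices with them, assuming the declarations are exported through ggml-cuda.h (the header is also updated in this release):

    #include <cstdio>
    #include "ggml-cuda.h"  // assumed to declare ggml_cuda_get_device_count / _description

    int main() {
        const int n_devices = ggml_cuda_get_device_count();
        for (int i = 0; i < n_devices; ++i) {
            char description[256];
            ggml_cuda_get_device_description(i, description, sizeof(description));
            printf("device %d: %s\n", i, description);  // prints the CUDA device name
        }
        return 0;
    }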