llama_cpp 0.2.1 → 0.3.0
This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +32 -0
- data/README.md +39 -6
- data/examples/README.md +32 -0
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +38 -0
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +231 -132
- data/ext/llama_cpp/src/ggml-cuda.cu +844 -337
- data/ext/llama_cpp/src/ggml-metal.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.m +193 -49
- data/ext/llama_cpp/src/ggml-metal.metal +477 -84
- data/ext/llama_cpp/src/ggml-opencl.cpp +493 -4
- data/ext/llama_cpp/src/ggml.c +1565 -430
- data/ext/llama_cpp/src/ggml.h +208 -14
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +194 -101
- data/ext/llama_cpp/src/llama.h +41 -14
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +12 -17
- metadata +3 -3
- data/lib/llama_cpp/client.rb +0 -172
| @@ -13,6 +13,10 @@ | |
| 13 13 | 
             
            #include "ggml-cuda.h"
         | 
| 14 14 | 
             
            #include "ggml.h"
         | 
| 15 15 |  | 
| 16 | 
            +
            #if defined(_MSC_VER)
         | 
| 17 | 
            +
            #pragma warning(disable: 4244 4267) // possible loss of data
         | 
| 18 | 
            +
            #endif
         | 
| 19 | 
            +
             | 
| 16 20 | 
             
            static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
         | 
| 17 21 |  | 
| 18 22 | 
             
            #define CUDA_CHECK(err)                                                                 \
         | 
| @@ -46,7 +50,15 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); | |
| 46 50 | 
             
                } while (0)
         | 
| 47 51 | 
             
            #endif // CUDART_VERSION >= 11
         | 
| 48 52 |  | 
| 49 | 
            -
             | 
| 53 | 
            +
            #ifdef GGML_CUDA_DMMV_F16
         | 
| 54 | 
            +
            typedef half dfloat; // dequantize float
         | 
| 55 | 
            +
            typedef half2 dfloat2;
         | 
| 56 | 
            +
            #else
         | 
| 57 | 
            +
            typedef float dfloat; // dequantize float
         | 
| 58 | 
            +
            typedef float2 dfloat2;
         | 
| 59 | 
            +
            #endif //GGML_CUDA_DMMV_F16
         | 
| 60 | 
            +
             | 
| 61 | 
            +
            typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
         | 
| 50 62 | 
             
            typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
         | 
| 51 63 | 
             
            typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
         | 
| 52 64 | 
             
            typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
         | 
| @@ -105,7 +117,13 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo | |
| 105 117 |  | 
| 106 118 | 
             
            //================================= k-quants
         | 
| 107 119 |  | 
| 120 | 
            +
            #ifdef GGML_QKK_64
         | 
| 121 | 
            +
            #define QK_K 64
         | 
| 122 | 
            +
            #define K_SCALE_SIZE 4
         | 
| 123 | 
            +
            #else
         | 
| 108 124 | 
             
            #define QK_K 256
         | 
| 125 | 
            +
            #define K_SCALE_SIZE 12
         | 
| 126 | 
            +
            #endif
         | 
| 109 127 |  | 
| 110 128 | 
             
            typedef struct {
         | 
| 111 129 | 
             
                uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
         | 
| @@ -116,13 +134,25 @@ typedef struct { | |
| 116 134 | 
             
            static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
         | 
| 117 135 |  | 
| 118 136 | 
             
            typedef struct {
         | 
| 119 | 
            -
                uint8_t hmask[QK_K/8];
         | 
| 120 | 
            -
                uint8_t qs[QK_K/4]; | 
| 121 | 
            -
             | 
| 122 | 
            -
                 | 
| 137 | 
            +
                uint8_t hmask[QK_K/8];     // quants - high bit
         | 
| 138 | 
            +
                uint8_t qs[QK_K/4];        // quants - low 2 bits
         | 
| 139 | 
            +
            #ifdef GGML_QKK_64
         | 
| 140 | 
            +
                uint8_t scales[2]; // scales, quantized with 8 bits
         | 
| 141 | 
            +
            #else
         | 
| 142 | 
            +
                uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
         | 
| 143 | 
            +
            #endif
         | 
| 144 | 
            +
                half d;             // super-block scale
         | 
| 123 145 | 
             
            } block_q3_K;
         | 
| 124 | 
            -
            static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 +  | 
| 146 | 
            +
            //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
         | 
| 125 147 |  | 
| 148 | 
            +
            #ifdef GGML_QKK_64
         | 
| 149 | 
            +
            typedef struct {
         | 
| 150 | 
            +
                half    d[2];              // super-block scales/mins
         | 
| 151 | 
            +
                uint8_t scales[2];         // 4-bit block scales/mins
         | 
| 152 | 
            +
                uint8_t qs[QK_K/2];        // 4--bit quants
         | 
| 153 | 
            +
            } block_q4_K;
         | 
| 154 | 
            +
            static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
         | 
| 155 | 
            +
            #else
         | 
| 126 156 | 
             
            typedef struct {
         | 
| 127 157 | 
             
                half d;                    // super-block scale for quantized scales
         | 
| 128 158 | 
             
                half dmin;                 // super-block scale for quantized mins
         | 
| @@ -130,15 +160,26 @@ typedef struct { | |
| 130 160 | 
             
                uint8_t qs[QK_K/2];        // 4--bit quants
         | 
| 131 161 | 
             
            } block_q4_K;
         | 
| 132 162 | 
             
            static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
         | 
| 163 | 
            +
            #endif
         | 
| 133 164 |  | 
| 165 | 
            +
            #ifdef GGML_QKK_64
         | 
| 166 | 
            +
            typedef struct {
         | 
| 167 | 
            +
                half d;                  // super-block scale
         | 
| 168 | 
            +
                int8_t scales[QK_K/16];  // block scales
         | 
| 169 | 
            +
                uint8_t qh[QK_K/8];      // quants, high bit
         | 
| 170 | 
            +
                uint8_t qs[QK_K/2];      // quants, low 4 bits
         | 
| 171 | 
            +
            } block_q5_K;
         | 
| 172 | 
            +
            static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
         | 
| 173 | 
            +
            #else
         | 
| 134 174 | 
             
            typedef struct {
         | 
| 135 | 
            -
                half | 
| 136 | 
            -
                half | 
| 137 | 
            -
                uint8_t scales[ | 
| 175 | 
            +
                half d;               // super-block scale for quantized scales
         | 
| 176 | 
            +
                half dmin;            // super-block scale for quantized mins
         | 
| 177 | 
            +
                uint8_t scales[K_SCALE_SIZE];   // scales and mins, quantized with 6 bits
         | 
| 138 178 | 
             
                uint8_t qh[QK_K/8];          // quants, high bit
         | 
| 139 179 | 
             
                uint8_t qs[QK_K/2];          // quants, low 4 bits
         | 
| 140 180 | 
             
            } block_q5_K;
         | 
| 141 | 
            -
            static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) +  | 
| 181 | 
            +
            static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
         | 
| 182 | 
            +
            #endif
         | 
| 142 183 |  | 
| 143 184 | 
             
            typedef struct {
         | 
| 144 185 | 
             
                uint8_t ql[QK_K/2];   // quants, lower 4 bits
         | 
| @@ -167,6 +208,12 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ | |
| 167 208 | 
             
            #define GGML_CUDA_DMMV_Y 1
         | 
| 168 209 | 
             
            #endif
         | 
| 169 210 |  | 
| 211 | 
            +
            #ifndef K_QUANTS_PER_ITERATION
         | 
| 212 | 
            +
            #define K_QUANTS_PER_ITERATION 2
         | 
| 213 | 
            +
            #else
         | 
| 214 | 
            +
            static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
         | 
| 215 | 
            +
            #endif
         | 
| 216 | 
            +
             | 
| 170 217 | 
             
            static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
         | 
| 171 218 | 
             
                const int i = blockDim.x*blockIdx.x + threadIdx.x;
         | 
| 172 219 |  | 
| @@ -224,82 +271,106 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol | |
| 224 271 | 
             
                }
         | 
| 225 272 | 
             
            }
         | 
| 226 273 |  | 
| 227 | 
            -
            static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs,  | 
| 274 | 
            +
            static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
         | 
| 228 275 | 
             
                const block_q4_0 * x = (const block_q4_0 *) vx;
         | 
| 229 276 |  | 
| 230 | 
            -
                const  | 
| 277 | 
            +
                const dfloat d = x[ib].d;
         | 
| 231 278 |  | 
| 232 | 
            -
                const  | 
| 279 | 
            +
                const int vui = x[ib].qs[iqs];
         | 
| 233 280 |  | 
| 234 | 
            -
                 | 
| 235 | 
            -
                 | 
| 281 | 
            +
                v.x = vui & 0xF;
         | 
| 282 | 
            +
                v.y = vui >> 4;
         | 
| 236 283 |  | 
| 237 | 
            -
             | 
| 238 | 
            -
                 | 
| 284 | 
            +
            #ifdef GGML_CUDA_DMMV_F16
         | 
| 285 | 
            +
                v = __hsub2(v, {8.0f, 8.0f});
         | 
| 286 | 
            +
                v = __hmul2(v, {d, d});
         | 
| 287 | 
            +
            #else
         | 
| 288 | 
            +
                v.x = (v.x - 8.0f) * d;
         | 
| 289 | 
            +
                v.y = (v.y - 8.0f) * d;
         | 
| 290 | 
            +
            #endif // GGML_CUDA_DMMV_F16
         | 
| 239 291 | 
             
            }
         | 
| 240 292 |  | 
| 241 | 
            -
            static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs,  | 
| 293 | 
            +
            static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
         | 
| 242 294 | 
             
                const block_q4_1 * x = (const block_q4_1 *) vx;
         | 
| 243 295 |  | 
| 244 | 
            -
                const  | 
| 245 | 
            -
                const  | 
| 296 | 
            +
                const dfloat d = x[ib].d;
         | 
| 297 | 
            +
                const dfloat m = x[ib].m;
         | 
| 246 298 |  | 
| 247 | 
            -
                const  | 
| 299 | 
            +
                const int vui = x[ib].qs[iqs];
         | 
| 248 300 |  | 
| 249 | 
            -
                 | 
| 250 | 
            -
                 | 
| 301 | 
            +
                v.x = vui & 0xF;
         | 
| 302 | 
            +
                v.y = vui >> 4;
         | 
| 251 303 |  | 
| 252 | 
            -
             | 
| 253 | 
            -
                 | 
| 304 | 
            +
            #ifdef GGML_CUDA_DMMV_F16
         | 
| 305 | 
            +
                v = __hmul2(v, {d, d});
         | 
| 306 | 
            +
                v = __hadd2(v, {m, m});
         | 
| 307 | 
            +
            #else
         | 
| 308 | 
            +
                v.x = (v.x * d) + m;
         | 
| 309 | 
            +
                v.y = (v.y * d) + m;
         | 
| 310 | 
            +
            #endif // GGML_CUDA_DMMV_F16
         | 
| 254 311 | 
             
            }
         | 
| 255 312 |  | 
| 256 | 
            -
            static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs,  | 
| 313 | 
            +
            static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
         | 
| 257 314 | 
             
                const block_q5_0 * x = (const block_q5_0 *) vx;
         | 
| 258 315 |  | 
| 259 | 
            -
                const  | 
| 316 | 
            +
                const dfloat d = x[ib].d;
         | 
| 260 317 |  | 
| 261 318 | 
             
                uint32_t qh;
         | 
| 262 319 | 
             
                memcpy(&qh, x[ib].qh, sizeof(qh));
         | 
| 263 320 |  | 
| 264 | 
            -
                const  | 
| 265 | 
            -
                const  | 
| 321 | 
            +
                const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
         | 
| 322 | 
            +
                const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
         | 
| 266 323 |  | 
| 267 | 
            -
                 | 
| 268 | 
            -
                 | 
| 324 | 
            +
                v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
         | 
| 325 | 
            +
                v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
         | 
| 269 326 |  | 
| 270 | 
            -
             | 
| 271 | 
            -
                 | 
| 327 | 
            +
            #ifdef GGML_CUDA_DMMV_F16
         | 
| 328 | 
            +
                v = __hsub2(v, {16.0f, 16.0f});
         | 
| 329 | 
            +
                v = __hmul2(v, {d, d});
         | 
| 330 | 
            +
            #else
         | 
| 331 | 
            +
                v.x = (v.x - 16.0f) * d;
         | 
| 332 | 
            +
                v.y = (v.y - 16.0f) * d;
         | 
| 333 | 
            +
            #endif // GGML_CUDA_DMMV_F16
         | 
| 272 334 | 
             
            }
         | 
| 273 335 |  | 
| 274 | 
            -
            static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs,  | 
| 336 | 
            +
            static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
         | 
| 275 337 | 
             
                const block_q5_1 * x = (const block_q5_1 *) vx;
         | 
| 276 338 |  | 
| 277 | 
            -
                const  | 
| 278 | 
            -
                const  | 
| 339 | 
            +
                const dfloat d = x[ib].d;
         | 
| 340 | 
            +
                const dfloat m = x[ib].m;
         | 
| 279 341 |  | 
| 280 342 | 
             
                uint32_t qh;
         | 
| 281 343 | 
             
                memcpy(&qh, x[ib].qh, sizeof(qh));
         | 
| 282 344 |  | 
| 283 | 
            -
                const  | 
| 284 | 
            -
                const  | 
| 345 | 
            +
                const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
         | 
| 346 | 
            +
                const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
         | 
| 285 347 |  | 
| 286 | 
            -
                 | 
| 287 | 
            -
                 | 
| 348 | 
            +
                v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
         | 
| 349 | 
            +
                v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
         | 
| 288 350 |  | 
| 289 | 
            -
             | 
| 290 | 
            -
                 | 
| 351 | 
            +
            #ifdef GGML_CUDA_DMMV_F16
         | 
| 352 | 
            +
                v = __hmul2(v, {d, d});
         | 
| 353 | 
            +
                v = __hadd2(v, {m, m});
         | 
| 354 | 
            +
            #else
         | 
| 355 | 
            +
                v.x = (v.x * d) + m;
         | 
| 356 | 
            +
                v.y = (v.y * d) + m;
         | 
| 357 | 
            +
            #endif // GGML_CUDA_DMMV_F16
         | 
| 291 358 | 
             
            }
         | 
| 292 359 |  | 
| 293 | 
            -
            static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs,  | 
| 360 | 
            +
            static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
         | 
| 294 361 | 
             
                const block_q8_0 * x = (const block_q8_0 *) vx;
         | 
| 295 362 |  | 
| 296 | 
            -
                const  | 
| 363 | 
            +
                const dfloat d = x[ib].d;
         | 
| 297 364 |  | 
| 298 | 
            -
                 | 
| 299 | 
            -
                 | 
| 365 | 
            +
                v.x = x[ib].qs[iqs + 0];
         | 
| 366 | 
            +
                v.y = x[ib].qs[iqs + 1];
         | 
| 300 367 |  | 
| 301 | 
            -
             | 
| 302 | 
            -
                 | 
| 368 | 
            +
            #ifdef GGML_CUDA_DMMV_F16
         | 
| 369 | 
            +
                v = __hmul2(v, {d, d});
         | 
| 370 | 
            +
            #else
         | 
| 371 | 
            +
                v.x *= d;
         | 
| 372 | 
            +
                v.y *= d;
         | 
| 373 | 
            +
            #endif // GGML_CUDA_DMMV_F16
         | 
| 303 374 | 
             
            }
         | 
| 304 375 |  | 
| 305 376 | 
             
            //================================== k-quants
         | 
| @@ -307,13 +378,14 @@ static __device__ void dequantize_q8_0(const void * vx, const int ib, const int | |
| 307 378 | 
             
            static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
         | 
| 308 379 |  | 
| 309 380 | 
             
                const int i   = blockIdx.x;
         | 
| 381 | 
            +
                const block_q2_K * x = (const block_q2_K *) vx;
         | 
| 382 | 
            +
             | 
| 310 383 | 
             
                const int tid = threadIdx.x;
         | 
| 384 | 
            +
            #if QK_K == 256
         | 
| 311 385 | 
             
                const int n   = tid/32;
         | 
| 312 386 | 
             
                const int l   = tid - 32*n;
         | 
| 313 387 | 
             
                const int is  = 8*n + l/16;
         | 
| 314 388 |  | 
| 315 | 
            -
                const block_q2_K * x = (const block_q2_K *) vx;
         | 
| 316 | 
            -
             | 
| 317 389 | 
             
                const uint8_t q = x[i].qs[32*n + l];
         | 
| 318 390 | 
             
                float * y = yy + i*QK_K + 128*n;
         | 
| 319 391 |  | 
| @@ -323,52 +395,32 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) { | |
| 323 395 | 
             
                y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
         | 
| 324 396 | 
             
                y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
         | 
| 325 397 | 
             
                y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
         | 
| 326 | 
            -
             | 
| 327 | 
            -
             | 
| 328 | 
            -
             | 
| 329 | 
            -
             | 
| 330 | 
            -
             | 
| 331 | 
            -
                 | 
| 332 | 
            -
             | 
| 333 | 
            -
                 | 
| 334 | 
            -
                 | 
| 335 | 
            -
             | 
| 336 | 
            -
                int n = iqs/128;                // 0 or 1
         | 
| 337 | 
            -
                int r = iqs - 128*n;            // 0...120 in steps of 8
         | 
| 338 | 
            -
                int l = r/8;                    // 0...15 in steps of 1
         | 
| 339 | 
            -
             | 
| 340 | 
            -
                const float   * y = yy + 128*n + l;
         | 
| 341 | 
            -
                const uint8_t * q = x[ib].qs + 32*n + l;
         | 
| 342 | 
            -
                const uint8_t * s = x[ib].scales + 8*n;
         | 
| 343 | 
            -
             | 
| 344 | 
            -
                const float dall = x[ib].d;
         | 
| 345 | 
            -
                const float dmin = x[ib].dmin;
         | 
| 346 | 
            -
             | 
| 347 | 
            -
                float sum = y[  0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
         | 
| 348 | 
            -
                          + y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
         | 
| 349 | 
            -
                          + y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
         | 
| 350 | 
            -
                          + y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
         | 
| 351 | 
            -
                          + y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
         | 
| 352 | 
            -
                          + y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
         | 
| 353 | 
            -
                          + y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
         | 
| 354 | 
            -
                          + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
         | 
| 355 | 
            -
             | 
| 356 | 
            -
                result = sum;
         | 
| 398 | 
            +
            #else
         | 
| 399 | 
            +
                const int is = tid/16;  // 0 or 1
         | 
| 400 | 
            +
                const int il = tid%16;  // 0...15
         | 
| 401 | 
            +
                const uint8_t q = x[i].qs[il] >> (2*is);
         | 
| 402 | 
            +
                float * y = yy + i*QK_K + 16*is + il;
         | 
| 403 | 
            +
                float dall = x[i].d;
         | 
| 404 | 
            +
                float dmin = x[i].dmin;
         | 
| 405 | 
            +
                y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
         | 
| 406 | 
            +
                y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
         | 
| 407 | 
            +
            #endif
         | 
| 357 408 |  | 
| 358 409 | 
             
            }
         | 
| 359 410 |  | 
| 360 411 | 
             
            static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
         | 
| 361 412 |  | 
| 362 | 
            -
                int  | 
| 363 | 
            -
                int i = blockIdx.x;
         | 
| 364 | 
            -
                int tid = r/2;
         | 
| 365 | 
            -
                int is0 = r%2;
         | 
| 366 | 
            -
                int l0 = 16*is0 + 4*(threadIdx.x%4);
         | 
| 367 | 
            -
                int n = tid / 4;
         | 
| 368 | 
            -
                int j = tid - 4*n;
         | 
| 369 | 
            -
             | 
| 413 | 
            +
                const int i = blockIdx.x;
         | 
| 370 414 | 
             
                const block_q3_K * x = (const block_q3_K *) vx;
         | 
| 371 415 |  | 
| 416 | 
            +
            #if QK_K == 256
         | 
| 417 | 
            +
                const int r = threadIdx.x/4;
         | 
| 418 | 
            +
                const int tid = r/2;
         | 
| 419 | 
            +
                const int is0 = r%2;
         | 
| 420 | 
            +
                const int l0 = 16*is0 + 4*(threadIdx.x%4);
         | 
| 421 | 
            +
                const int n = tid / 4;
         | 
| 422 | 
            +
                const int j = tid - 4*n;
         | 
| 423 | 
            +
             | 
| 372 424 | 
             
                uint8_t m = 1 << (4*n + j);
         | 
| 373 425 | 
             
                int is = 8*n + 2*j + is0;
         | 
| 374 426 | 
             
                int shift = 2*j;
         | 
| @@ -385,54 +437,31 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) { | |
| 385 437 | 
             
                const uint8_t * hm = x[i].hmask;
         | 
| 386 438 |  | 
| 387 439 | 
             
                for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
         | 
| 440 | 
            +
            #else
         | 
| 441 | 
            +
                const int tid = threadIdx.x;
         | 
| 442 | 
            +
                const int is  = tid/16;  // 0 or 1
         | 
| 443 | 
            +
                const int il  = tid%16;  // 0...15
         | 
| 444 | 
            +
                const int im  = il/8;    // 0...1
         | 
| 445 | 
            +
                const int in  = il%8;    // 0...7
         | 
| 388 446 |  | 
| 389 | 
            -
             | 
| 390 | 
            -
             | 
| 391 | 
            -
            static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
         | 
| 392 | 
            -
             | 
| 393 | 
            -
                const block_q3_K * x = (const block_q3_K *) vx;
         | 
| 394 | 
            -
             | 
| 395 | 
            -
                const uint32_t kmask1 = 0x03030303;
         | 
| 396 | 
            -
                const uint32_t kmask2 = 0x0f0f0f0f;
         | 
| 397 | 
            -
             | 
| 398 | 
            -
                uint32_t aux[3];
         | 
| 399 | 
            -
                uint32_t utmp[4];
         | 
| 400 | 
            -
             | 
| 401 | 
            -
                // if n is 0, we want to do the lower 128, else the upper 128,
         | 
| 402 | 
            -
                // covering y[l+0],  y[l+32], y[l+64], y[l+96] and
         | 
| 403 | 
            -
                //          y[l+16], y[l+48], y[l+80], y[l+112]
         | 
| 404 | 
            -
                int n = iqs/128;                // 0 or 1
         | 
| 405 | 
            -
                int r = iqs - 128*n;            // 0...120 in steps of 8
         | 
| 406 | 
            -
                int l = r/8;                    // 0...15 in steps of 1
         | 
| 407 | 
            -
             | 
| 408 | 
            -
                const float   * y = yy + 128*n + l;
         | 
| 409 | 
            -
                const uint8_t * q = x[ib].qs + 32*n + l;
         | 
| 410 | 
            -
                const uint8_t * hm = x[ib].hmask + l;
         | 
| 411 | 
            -
                const int8_t  * s = (const int8_t *)utmp + 8*n;
         | 
| 412 | 
            -
             | 
| 413 | 
            -
                memcpy(aux, x[ib].scales, 12);
         | 
| 414 | 
            -
                utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
         | 
| 415 | 
            -
                utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
         | 
| 416 | 
            -
                utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
         | 
| 417 | 
            -
                utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
         | 
| 418 | 
            -
             | 
| 419 | 
            -
                const float dall = x[ib].d;
         | 
| 420 | 
            -
             | 
| 421 | 
            -
                const uint8_t m = 1 << (4*n);
         | 
| 447 | 
            +
                float * y = yy + i*QK_K + 16*is + il;
         | 
| 422 448 |  | 
| 423 | 
            -
                 | 
| 424 | 
            -
             | 
| 425 | 
            -
             | 
| 426 | 
            -
                          + y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
         | 
| 427 | 
            -
                          + y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
         | 
| 428 | 
            -
                          + y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
         | 
| 429 | 
            -
                          + y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
         | 
| 430 | 
            -
                          + y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
         | 
| 449 | 
            +
                const uint8_t q = x[i].qs[il] >> (2*is);
         | 
| 450 | 
            +
                const uint8_t h = x[i].hmask[in] >> (2*is + im);
         | 
| 451 | 
            +
                const float   d = (float)x[i].d;
         | 
| 431 452 |  | 
| 432 | 
            -
                 | 
| 453 | 
            +
                if (is == 0) {
         | 
| 454 | 
            +
                    y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
         | 
| 455 | 
            +
                    y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
         | 
| 456 | 
            +
                } else {
         | 
| 457 | 
            +
                    y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
         | 
| 458 | 
            +
                    y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
         | 
| 459 | 
            +
                }
         | 
| 460 | 
            +
            #endif
         | 
| 433 461 |  | 
| 434 462 | 
             
            }
         | 
| 435 463 |  | 
| 464 | 
            +
            #if QK_K == 256
         | 
| 436 465 | 
             
            static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
         | 
| 437 466 | 
             
                if (j < 4) {
         | 
| 438 467 | 
             
                    d = q[j] & 63; m = q[j + 4] & 63;
         | 
| @@ -441,19 +470,14 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t | |
| 441 470 | 
             
                    m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
         | 
| 442 471 | 
             
                }
         | 
| 443 472 | 
             
            }
         | 
| 473 | 
            +
            #endif
         | 
| 444 474 |  | 
| 445 475 | 
             
            static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
         | 
| 446 476 | 
             
                const block_q4_K * x = (const block_q4_K *) vx;
         | 
| 447 477 |  | 
| 448 478 | 
             
                const int i = blockIdx.x;
         | 
| 449 479 |  | 
| 450 | 
            -
             | 
| 451 | 
            -
                //const int tid = threadIdx.x;
         | 
| 452 | 
            -
                //const int il  = tid/16;
         | 
| 453 | 
            -
                //const int ir  = tid%16;
         | 
| 454 | 
            -
                //const int is  = 2*il;
         | 
| 455 | 
            -
                //const int n   = 2;
         | 
| 456 | 
            -
             | 
| 480 | 
            +
            #if QK_K == 256
         | 
| 457 481 | 
             
                // assume 32 threads
         | 
| 458 482 | 
             
                const int tid = threadIdx.x;
         | 
| 459 483 | 
             
                const int il  = tid/8;
         | 
| @@ -477,38 +501,15 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) { | |
| 477 501 | 
             
                    y[l + 0] = d1 * (q[l] & 0xF) - m1;
         | 
| 478 502 | 
             
                    y[l +32] = d2 * (q[l] >>  4) - m2;
         | 
| 479 503 | 
             
                }
         | 
| 480 | 
            -
             | 
| 481 | 
            -
             | 
| 482 | 
            -
             | 
| 483 | 
            -
             | 
| 484 | 
            -
                const  | 
| 485 | 
            -
             | 
| 486 | 
            -
             | 
| 487 | 
            -
                 | 
| 488 | 
            -
             | 
| 489 | 
            -
                const int is = 2*j;             // is is in 0...6 in steps of 2
         | 
| 490 | 
            -
             | 
| 491 | 
            -
                const float   * y = yy + 64*j + ir;
         | 
| 492 | 
            -
                const uint8_t * q = x[ib].qs + 32*j + ir;
         | 
| 493 | 
            -
             | 
| 494 | 
            -
                const float dall = x[ib].d;
         | 
| 495 | 
            -
                const float dmin = x[ib].dmin;
         | 
| 496 | 
            -
             | 
| 497 | 
            -
                uint8_t sc, m;
         | 
| 498 | 
            -
                get_scale_min_k4(is + 0, x[ib].scales, sc, m);
         | 
| 499 | 
            -
                const float d1 = dall * sc;
         | 
| 500 | 
            -
                const float m1 = dmin * m;
         | 
| 501 | 
            -
                get_scale_min_k4(is + 1, x[ib].scales, sc, m);
         | 
| 502 | 
            -
                const float d2 = dall * sc;
         | 
| 503 | 
            -
                const float m2 = dmin * m;
         | 
| 504 | 
            -
             | 
| 505 | 
            -
                float sum = 0;
         | 
| 506 | 
            -
                for (int k = 0; k < 4; ++k) {
         | 
| 507 | 
            -
                    sum += y[k +  0] * (d1 * (q[k] & 0xF) - m1);
         | 
| 508 | 
            -
                    sum += y[k + 32] * (d2 * (q[k] >>  4) - m2);
         | 
| 509 | 
            -
                }
         | 
| 510 | 
            -
                result = sum;
         | 
| 511 | 
            -
             | 
| 504 | 
            +
            #else
         | 
| 505 | 
            +
                const int tid = threadIdx.x;
         | 
| 506 | 
            +
                const uint8_t * q = x[i].qs;
         | 
| 507 | 
            +
                float * y = yy + i*QK_K;
         | 
| 508 | 
            +
                const float d = (float)x[i].d[0];
         | 
| 509 | 
            +
                const float m = (float)x[i].d[1];
         | 
| 510 | 
            +
                y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
         | 
| 511 | 
            +
                y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
         | 
| 512 | 
            +
            #endif
         | 
| 512 513 | 
             
            }
         | 
| 513 514 |  | 
| 514 515 | 
             
            static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
         | 
| @@ -516,6 +517,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { | |
| 516 517 |  | 
| 517 518 | 
             
                const int i = blockIdx.x;
         | 
| 518 519 |  | 
| 520 | 
            +
            #if QK_K == 256
         | 
| 519 521 | 
             
                // assume 64 threads - this is very slightly better than the one below
         | 
| 520 522 | 
             
                const int tid = threadIdx.x;
         | 
| 521 523 | 
             
                const int il  = tid/16;   // il is in 0...3
         | 
| @@ -542,49 +544,25 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { | |
| 542 544 | 
             
                hm <<= 1;
         | 
| 543 545 | 
             
                y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
         | 
| 544 546 | 
             
                y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
         | 
| 545 | 
            -
             | 
| 546 | 
            -
             | 
| 547 | 
            -
             | 
| 548 | 
            -
             | 
| 549 | 
            -
                const  | 
| 550 | 
            -
             | 
| 551 | 
            -
             | 
| 552 | 
            -
                const  | 
| 553 | 
            -
                 | 
| 554 | 
            -
                 | 
| 555 | 
            -
             | 
| 556 | 
            -
             | 
| 557 | 
            -
                const uint8_t * ql = x[ib].qs + 32*j + ir;
         | 
| 558 | 
            -
                const uint8_t * qh = x[ib].qh + ir;
         | 
| 559 | 
            -
             | 
| 560 | 
            -
                const float dall = x[ib].d;
         | 
| 561 | 
            -
                const float dmin = x[ib].dmin;
         | 
| 562 | 
            -
             | 
| 563 | 
            -
                uint8_t sc, m;
         | 
| 564 | 
            -
                get_scale_min_k4(is + 0, x[ib].scales, sc, m);
         | 
| 565 | 
            -
                const float d1 = dall * sc;
         | 
| 566 | 
            -
                const float m1 = dmin * m;
         | 
| 567 | 
            -
                get_scale_min_k4(is + 1, x[ib].scales, sc, m);
         | 
| 568 | 
            -
                const float d2 = dall * sc;
         | 
| 569 | 
            -
                const float m2 = dmin * m;
         | 
| 570 | 
            -
             | 
| 571 | 
            -
                uint8_t   hm  = 1 << is;
         | 
| 572 | 
            -
                float sum = 0;
         | 
| 573 | 
            -
                for (int k = 0; k < 4; ++k) {
         | 
| 574 | 
            -
                    sum += y[k +  0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
         | 
| 575 | 
            -
                }
         | 
| 576 | 
            -
                hm <<= 1;
         | 
| 577 | 
            -
                for (int k = 0; k < 4; ++k) {
         | 
| 578 | 
            -
                    sum += y[k + 32] * (d2 * ((ql[k] >>  4) + (qh[k] & hm ? 16 : 0)) - m2);
         | 
| 579 | 
            -
                }
         | 
| 580 | 
            -
                result = sum;
         | 
| 581 | 
            -
             | 
| 547 | 
            +
            #else
         | 
| 548 | 
            +
                const int tid = threadIdx.x;
         | 
| 549 | 
            +
                const uint8_t q = x[i].qs[tid];
         | 
| 550 | 
            +
                const int im = tid/8;  // 0...3
         | 
| 551 | 
            +
                const int in = tid%8;  // 0...7
         | 
| 552 | 
            +
                const int is = tid/16; // 0 or 1
         | 
| 553 | 
            +
                const uint8_t h = x[i].qh[in] >> im;
         | 
| 554 | 
            +
                const float d = x[i].d;
         | 
| 555 | 
            +
                float * y = yy + i*QK_K + tid;
         | 
| 556 | 
            +
                y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
         | 
| 557 | 
            +
                y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
         | 
| 558 | 
            +
            #endif
         | 
| 582 559 | 
             
            }
         | 
| 583 560 |  | 
| 584 561 | 
             
            static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
         | 
| 585 562 | 
             
                const block_q6_K * x = (const block_q6_K *) vx;
         | 
| 586 563 |  | 
| 587 564 | 
             
                const int i = blockIdx.x;
         | 
| 565 | 
            +
            #if QK_K == 256
         | 
| 588 566 |  | 
| 589 567 | 
             
                // assume 64 threads - this is very slightly better than the one below
         | 
| 590 568 | 
             
                const int tid = threadIdx.x;
         | 
| @@ -604,40 +582,566 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { | |
| 604 582 | 
             
                y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
         | 
| 605 583 | 
             
                y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
         | 
| 606 584 | 
             
                y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
         | 
| 585 | 
            +
            #else
         | 
| 586 | 
            +
             | 
| 587 | 
            +
                // assume 32 threads
         | 
| 588 | 
            +
                const int tid = threadIdx.x;
         | 
| 589 | 
            +
                const int ip  = tid/16;         // 0 or 1
         | 
| 590 | 
            +
                const int il  = tid - 16*ip;    // 0...15
         | 
| 591 | 
            +
             | 
| 592 | 
            +
                float * y = yy + i*QK_K + 16*ip + il;
         | 
| 593 | 
            +
             | 
| 594 | 
            +
                const float d = x[i].d;
         | 
| 595 | 
            +
             | 
| 596 | 
            +
                const uint8_t   ql = x[i].ql[16*ip + il];
         | 
| 597 | 
            +
                const uint8_t   qh = x[i].qh[il] >> (2*ip);
         | 
| 598 | 
            +
                const int8_t  * sc = x[i].scales;
         | 
| 599 | 
            +
             | 
| 600 | 
            +
                y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
         | 
| 601 | 
            +
                y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
         | 
| 602 | 
            +
            #endif
         | 
| 607 603 | 
             
            }
         | 
| 608 604 |  | 
| 609 | 
            -
            static  | 
| 605 | 
            +
            static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
         | 
| 610 606 |  | 
| 611 | 
            -
                 | 
| 607 | 
            +
                static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
         | 
| 608 | 
            +
             | 
| 609 | 
            +
                const int row = blockIdx.y*blockDim.y + threadIdx.y;
         | 
| 610 | 
            +
                if (row > nrows) return;
         | 
| 611 | 
            +
             | 
| 612 | 
            +
                const int num_blocks_per_row = ncols / QK_K;
         | 
| 613 | 
            +
                const int ib0 = row*num_blocks_per_row;
         | 
| 614 | 
            +
             | 
| 615 | 
            +
                const block_q2_K * x = (const block_q2_K *)vx + ib0;
         | 
| 616 | 
            +
             | 
| 617 | 
            +
                float tmp = 0; // partial sum for thread in warp
         | 
| 618 | 
            +
             | 
| 619 | 
            +
            #if QK_K == 256
         | 
| 620 | 
            +
                const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
         | 
| 621 | 
            +
                const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
         | 
| 622 | 
            +
             | 
| 623 | 
            +
                const int step = 16/K_QUANTS_PER_ITERATION;
         | 
| 624 | 
            +
             | 
| 625 | 
            +
                const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
         | 
| 626 | 
            +
                const int in = tid - step*im;                        // 0...15 or 0...7
         | 
| 627 | 
            +
             | 
| 628 | 
            +
                const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
         | 
| 629 | 
            +
                const int q_offset = 32*im + l0;
         | 
| 630 | 
            +
                const int s_offset = 8*im;
         | 
| 631 | 
            +
                const int y_offset = 128*im + l0;
         | 
| 632 | 
            +
             | 
| 633 | 
            +
                uint32_t aux[4];
         | 
| 634 | 
            +
                const uint8_t * d = (const uint8_t *)aux;
         | 
| 635 | 
            +
                const uint8_t * m = (const uint8_t *)(aux + 2);
         | 
| 636 | 
            +
             | 
| 637 | 
            +
                for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
         | 
| 638 | 
            +
             | 
| 639 | 
            +
                    const float   * y = yy + i * QK_K + y_offset;
         | 
| 640 | 
            +
                    const uint8_t * q = x[i].qs + q_offset;
         | 
| 641 | 
            +
             | 
| 642 | 
            +
                    const float dall = x[i].d;
         | 
| 643 | 
            +
                    const float dmin = x[i].dmin;
         | 
| 644 | 
            +
             | 
| 645 | 
            +
                    const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
         | 
| 646 | 
            +
                    aux[0] = a[0] & 0x0f0f0f0f;
         | 
| 647 | 
            +
                    aux[1] = a[1] & 0x0f0f0f0f;
         | 
| 648 | 
            +
                    aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
         | 
| 649 | 
            +
                    aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
         | 
| 650 | 
            +
             | 
| 651 | 
            +
                    float sum1 = 0, sum2 = 0;
         | 
| 652 | 
            +
                    for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
         | 
| 653 | 
            +
                        sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
         | 
| 654 | 
            +
                              + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
         | 
| 655 | 
            +
                              + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
         | 
| 656 | 
            +
                              + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
         | 
| 657 | 
            +
                              + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
         | 
| 658 | 
            +
                              + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
         | 
| 659 | 
            +
                              + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
         | 
| 660 | 
            +
                              +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
         | 
| 661 | 
            +
                        sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
         | 
| 662 | 
            +
                              + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
         | 
| 663 | 
            +
             | 
| 664 | 
            +
                    }
         | 
| 665 | 
            +
                    tmp += dall * sum1 - dmin * sum2;
         | 
| 666 | 
            +
             | 
| 667 | 
            +
                }
         | 
| 668 | 
            +
            #else
         | 
| 669 | 
            +
                const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
         | 
| 670 | 
            +
                const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
         | 
| 671 | 
            +
                const int offset = tid * K_QUANTS_PER_ITERATION;
         | 
| 672 | 
            +
             | 
| 673 | 
            +
                uint32_t uaux[2];
         | 
| 674 | 
            +
                const uint8_t * d = (const uint8_t *)uaux;
         | 
| 675 | 
            +
             | 
| 676 | 
            +
                for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
         | 
| 677 | 
            +
             | 
| 678 | 
            +
                    const float   * y = yy + i * QK_K + offset;
         | 
| 679 | 
            +
                    const uint8_t * q = x[i].qs + offset;
         | 
| 680 | 
            +
                    const uint32_t * s = (const uint32_t *)x[i].scales;
         | 
| 681 | 
            +
             | 
| 682 | 
            +
                    uaux[0] = s[0] & 0x0f0f0f0f;
         | 
| 683 | 
            +
                    uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
         | 
| 684 | 
            +
             | 
| 685 | 
            +
                    const half2 * dh = (const half2 *)&x[i].d;
         | 
| 686 | 
            +
             | 
| 687 | 
            +
                    const float2 dall = __half22float2(dh[0]);
         | 
| 688 | 
            +
             | 
| 689 | 
            +
                    float sum1 = 0, sum2 = 0;
         | 
| 690 | 
            +
                    for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
         | 
| 691 | 
            +
                        const uint8_t ql = q[l];
         | 
| 692 | 
            +
                        sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
         | 
| 693 | 
            +
                              + y[l+16] * d[1] * ((ql >> 2) & 3)
         | 
| 694 | 
            +
                              + y[l+32] * d[2] * ((ql >> 4) & 3)
         | 
| 695 | 
            +
                              + y[l+48] * d[3] * ((ql >> 6) & 3);
         | 
| 696 | 
            +
                        sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
         | 
| 697 | 
            +
                    }
         | 
| 698 | 
            +
                    tmp += dall.x * sum1 - dall.y * sum2;
         | 
| 699 | 
            +
                }
         | 
| 700 | 
            +
            #endif
         | 
| 701 | 
            +
             | 
| 702 | 
            +
                // sum up partial sums and write back result
         | 
| 703 | 
            +
                __syncthreads();
         | 
| 704 | 
            +
            #pragma unroll
         | 
| 705 | 
            +
                for (int mask = 16; mask > 0; mask >>= 1) {
         | 
| 706 | 
            +
                    tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
         | 
| 707 | 
            +
                }
         | 
| 708 | 
            +
             | 
| 709 | 
            +
                if (threadIdx.x == 0) {
         | 
| 710 | 
            +
                    dst[row] = tmp;
         | 
| 711 | 
            +
                }
         | 
| 712 | 
            +
            }
         | 
| 713 | 
            +
             | 
| 714 | 
            +
            static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
         | 
| 715 | 
            +
             | 
| 716 | 
            +
                const int row = blockIdx.y*blockDim.y + threadIdx.y;
         | 
| 717 | 
            +
                if (row > nrows) return;
         | 
| 718 | 
            +
             | 
| 719 | 
            +
                const int num_blocks_per_row = ncols / QK_K;
         | 
| 720 | 
            +
                const int ib0 = row*num_blocks_per_row;
         | 
| 721 | 
            +
             | 
| 722 | 
            +
                const block_q3_K * x = (const block_q3_K *)vx + ib0;
         | 
| 723 | 
            +
             | 
| 724 | 
            +
                float tmp = 0; // partial sum for thread in warp
         | 
| 725 | 
            +
             | 
| 726 | 
            +
            #if QK_K == 256
         | 
| 727 | 
            +
             | 
| 728 | 
            +
                const uint16_t kmask1 = 0x0303;
         | 
| 729 | 
            +
                const uint16_t kmask2 = 0x0f0f;
         | 
| 730 | 
            +
             | 
| 731 | 
            +
                const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
         | 
| 732 | 
            +
                const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
         | 
| 733 | 
            +
             | 
| 734 | 
            +
                const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
         | 
| 735 | 
            +
                const int step = 16/K_QUANTS_PER_ITERATION;
         | 
| 736 | 
            +
                const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
         | 
| 737 | 
            +
                const int in = tid - step*im;                        // 0....15 or 0...7
         | 
| 738 | 
            +
             | 
| 739 | 
            +
                const uint8_t m = 1 << (4*im);
         | 
| 740 | 
            +
             | 
| 741 | 
            +
                const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
         | 
| 742 | 
            +
                const int q_offset =  32*im + l0;
         | 
| 743 | 
            +
                const int y_offset = 128*im + l0;
         | 
| 744 | 
            +
             | 
| 745 | 
            +
                uint16_t utmp[4];
         | 
| 746 | 
            +
                const int8_t * s = (const int8_t *)utmp;
         | 
| 747 | 
            +
             | 
| 748 | 
            +
                const uint16_t s_shift = 4*im;
         | 
| 749 | 
            +
             | 
| 750 | 
            +
                for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
         | 
| 751 | 
            +
             | 
| 752 | 
            +
                    const float   * y  = yy + i * QK_K + y_offset;
         | 
| 753 | 
            +
                    const uint8_t * q = x[i].qs + q_offset;
         | 
| 754 | 
            +
                    const uint8_t * h = x[i].hmask + l0;
         | 
| 755 | 
            +
             | 
| 756 | 
            +
                    const uint16_t * a = (const uint16_t *)x[i].scales;
         | 
| 757 | 
            +
                    utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
         | 
| 758 | 
            +
                    utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
         | 
| 759 | 
            +
                    utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
         | 
| 760 | 
            +
                    utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
         | 
| 761 | 
            +
             | 
| 762 | 
            +
                    const float d = x[i].d;
         | 
| 763 | 
            +
             | 
| 764 | 
            +
                    float sum = 0;
         | 
| 765 | 
            +
                    for (int l = 0; l < n; ++l) {
         | 
| 766 | 
            +
                        sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
         | 
| 767 | 
            +
                             + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
         | 
| 768 | 
            +
                             + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
         | 
| 769 | 
            +
                             + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
         | 
| 770 | 
            +
                        sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
         | 
| 771 | 
            +
                             + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
         | 
| 772 | 
            +
                             + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
         | 
| 773 | 
            +
                            + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
         | 
| 774 | 
            +
                    }
         | 
| 775 | 
            +
                    tmp += d * sum;
         | 
| 776 | 
            +
             | 
| 777 | 
            +
                }
         | 
| 778 | 
            +
            #else
         | 
| 779 | 
            +
             | 
| 780 | 
            +
                const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
         | 
| 781 | 
            +
                const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
         | 
| 782 | 
            +
                const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
         | 
| 783 | 
            +
                const int in = offset/8;                                 // 0 or 1
         | 
| 784 | 
            +
                const int im = offset%8;                                 // 0...7
         | 
| 785 | 
            +
             | 
| 786 | 
            +
                for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
         | 
| 787 | 
            +
             | 
| 788 | 
            +
                    const float   * y = yy + i * QK_K + offset;
         | 
| 789 | 
            +
                    const uint8_t * q = x[i].qs + offset;
         | 
| 790 | 
            +
                    const uint8_t * s = x[i].scales;
         | 
| 791 | 
            +
             | 
| 792 | 
            +
                    const float dall = (float)x[i].d;
         | 
| 793 | 
            +
             | 
| 794 | 
            +
                    float sum = 0;
         | 
| 795 | 
            +
                    for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
         | 
| 796 | 
            +
                        const uint8_t hl = x[i].hmask[im+l] >> in;
         | 
| 797 | 
            +
                        const uint8_t ql = q[l];
         | 
| 798 | 
            +
                        sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
         | 
| 799 | 
            +
                             + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
         | 
| 800 | 
            +
                             + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
         | 
| 801 | 
            +
                             + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
         | 
| 802 | 
            +
                    }
         | 
| 803 | 
            +
                    tmp += sum;
         | 
| 804 | 
            +
                }
         | 
| 805 | 
            +
            #endif
         | 
| 806 | 
            +
             | 
| 807 | 
            +
                // sum up partial sums and write back result
         | 
| 808 | 
            +
                __syncthreads();
         | 
| 809 | 
            +
            #pragma unroll
         | 
| 810 | 
            +
                for (int mask = 16; mask > 0; mask >>= 1) {
         | 
| 811 | 
            +
                    tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
         | 
| 812 | 
            +
                }
         | 
| 813 | 
            +
             | 
| 814 | 
            +
                if (threadIdx.x == 0) {
         | 
| 815 | 
            +
                    dst[row] = tmp;
         | 
| 816 | 
            +
                }
         | 
| 817 | 
            +
            }
         | 
| 818 | 
            +
             | 
| 819 | 
            +
            // Fused dequantize + matrix-vector product for a Q4_K-quantized matrix.
            //
            //   vx    : quantized matrix, read as an array of block_q4_K with ncols/QK_K blocks per row
            //   yy    : dense float vector that every row is dotted with
            //   dst   : output vector; dst[row] receives the dot product of row `row` with yy
            //   ncols : number of columns (NOTE(review): presumably a multiple of QK_K - confirm at call site)
            //   nrows : number of rows; threads whose row index is >= nrows exit without touching memory
            //
            // The row is selected by blockIdx.y/threadIdx.y and the lanes along threadIdx.x split the
            // work of one row. The final reduction is a 32-lane __shfl_xor_sync butterfly, so this
            // assumes blockDim.x == 32 (one full warp per row) - TODO confirm against the launcher.
            static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {

                const int row = blockIdx.y*blockDim.y + threadIdx.y;
                // Valid rows are 0...nrows-1. The previous test (row > nrows) let row == nrows through,
                // which reads past the end of the matrix and writes dst[nrows].
                if (row >= nrows) return;
                const int num_blocks_per_row = ncols / QK_K;
                const int ib0 = row*num_blocks_per_row;

                const block_q4_K * x = (const block_q4_K *)vx + ib0;

            #if QK_K == 256
                const uint16_t kmask1 = 0x3f3f;
                const uint16_t kmask2 = 0x0f0f;
                const uint16_t kmask3 = 0xc0c0;

                const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
                const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1

                const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4

                const int il  = tid/step;                            // 0...3
                const int ir  = tid - step*il;                       // 0...7 or 0...3
                const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4

                const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
                const int in = il%2;

                const int l0 = n*(2*ir + in);
                const int q_offset = 32*im + l0;
                const int y_offset = 64*im + l0;

                // aux holds the four 16-bit words unpacked from the packed scales below;
                // sc views the same storage as eight individual bytes.
                uint16_t aux[4];
                const uint8_t * sc = (const uint8_t *)aux;

                float tmp = 0; // partial sum for thread in warp

                // Threads with the same ix walk the row's blocks with stride K_QUANTS_PER_ITERATION.
                for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

                    const uint8_t * q1 = x[i].qs + q_offset;
                    const uint8_t * q2 = q1 + 64;
                    const float   * y1 = yy + i*QK_K + y_offset;
                    const float   * y2 = y1 + 128;

                    const float dall = x[i].d;
                    const float dmin = x[i].dmin;

                    // Unpack the bytes this thread needs: sc[0], sc[1], sc[4], sc[5] multiply the
                    // quant sums and sc[2], sc[3], sc[6], sc[7] multiply the y sums (see below).
                    const uint16_t * a = (const uint16_t *)x[i].scales;
                    aux[0] = a[im+0] & kmask1;
                    aux[1] = a[im+2] & kmask1;
                    aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
                    aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);

                    float4 s = {0.f, 0.f, 0.f, 0.f};
                    float smin = 0;
                    for (int l = 0; l < n; ++l) {
                        // Low nibble pairs with y[l], high nibble of the same byte with y[l+32].
                        s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
                        s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
                        smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
                    }
                    tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;

                }
            #else
                const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
                const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0,1 or 0...3

                const int step = tid * K_QUANTS_PER_ITERATION;

                uint16_t aux16[2];
                const uint8_t * s = (const uint8_t *)aux16;

                float tmp = 0;

                for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
                    const uint8_t * q = x[i].qs + step;
                    const float   * y = yy + i*QK_K + step;
                    const uint16_t * a = (const uint16_t *)x[i].scales;
                    // Low nibbles of the two scale bytes go to s[0], s[1]; high nibbles to s[2], s[3].
                    aux16[0] = a[0] & 0x0f0f;
                    aux16[1] = (a[0] >> 4) & 0x0f0f;
                    const float d = (float)x[i].d[0];
                    const float m = (float)x[i].d[1];
                    float sum = 0.f;
                    for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
                        sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
                             + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
                             + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
                             + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
                    }
                    tmp += sum;
                }

            #endif

                // Sum up the partial sums of the warp and write back the result.
                // No __syncthreads() here: the kernel uses no shared memory, and rows that returned
                // early above would never reach a block-wide barrier. __shfl_xor_sync synchronizes the
                // participating lanes on its own.
            #pragma unroll
                for (int mask = 16; mask > 0; mask >>= 1) {
                    tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
                }

                // After the butterfly every lane holds the full sum; lane 0 alone stores it
                // (tid == 0 matched several lanes and issued redundant stores).
                if (threadIdx.x == 0) {
                    dst[row] = tmp;
                }
            }
         | 
| 635 922 |  | 
| 636 | 
            -
            static  | 
| 923 | 
            +
            // Fused dequantize + matrix-vector product for a Q5_K-quantized matrix.
            //
            //   vx    : quantized matrix, read as an array of block_q5_K with ncols/QK_K blocks per row
            //   yy    : dense float vector that the row is dotted with
            //   dst   : output vector; dst[blockIdx.x] receives the result
            //   ncols : number of columns
            //
            // One thread block handles one row (row = blockIdx.x). Partial sums are combined with a
            // 32-lane __shfl_xor_sync butterfly, so blockDim.x is presumably 32 - TODO confirm against
            // the launcher.
            static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {

                const int row    = blockIdx.x;
                const int nb_row = ncols / QK_K;                    // quantized blocks in one row

                const block_q5_K * x = (const block_q5_K *)vx + row*nb_row;

                float tmp = 0; // this thread's share of the dot product

            #if QK_K == 256
                const uint16_t kmask1 = 0x3f3f;
                const uint16_t kmask2 = 0x0f0f;
                const uint16_t kmask3 = 0xc0c0;

                const int sub      = threadIdx.x/2;  // 0...15
                const int blk_lane = threadIdx.x%2;  // even lanes take even blocks, odd lanes odd blocks

                const int quarter = sub/4;           // 0...3
                const int slot    = sub - 4*quarter; // 0...3
                const int nvals   = 2;

                const int im  = quarter/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
                const int odd = quarter%2;

                const int l0       = nvals*(2*slot + odd);
                const int q_offset = 32*im + l0;
                const int y_offset = 64*im + l0;

                // Bit of qh that carries the fifth quant bit for the y1 / y2 groups; the bit
                // directly above each one (<< 1) serves the matching high-nibble group.
                const uint8_t hm1 = 1 << (2*im);
                const uint8_t hm2 = hm1 << 4;

                // aux receives the unpacked scale words, sc is the byte-wise view of it.
                uint16_t aux[4];
                const uint8_t * sc = (const uint8_t *)aux;

                for (int i = blk_lane; i < nb_row; i += 2) {

                    const uint8_t * ql1 = x[i].qs + q_offset;
                    const uint8_t * ql2 = ql1 + 64;
                    const uint8_t * qh  = x[i].qh + l0;
                    const float   * y1  = yy + i*QK_K + y_offset;
                    const float   * y2  = y1 + 128;

                    const float dall = x[i].d;
                    const float dmin = x[i].dmin;

                    const uint16_t * a = (const uint16_t *)x[i].scales;
                    aux[0] = a[im+0] & kmask1;
                    aux[1] = a[im+2] & kmask1;
                    aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
                    aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);

                    float4 acc  = {0.f, 0.f, 0.f, 0.f};
                    float  smin = 0;
                    for (int l = 0; l < nvals; ++l) {
                        // 4 low bits come from a nibble of ql, the fifth bit (worth 16) from qh.
                        acc.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
                               + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
                        acc.y += y1[l+32] * ((ql1[l+ 0] >>  4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
                               + y1[l+48] * ((ql1[l+16] >>  4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
                        acc.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
                               + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
                        acc.w += y2[l+32] * ((ql2[l+ 0] >>  4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
                               + y2[l+48] * ((ql2[l+16] >>  4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
                        smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
                              + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
                    }
                    tmp += dall * (acc.x * sc[0] + acc.y * sc[1] + acc.z * sc[4] + acc.w * sc[5]) - dmin * smin;
                }

            #else
                const int sub      = threadIdx.x/(2*K_QUANTS_PER_ITERATION);
                const int blk_lane = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
                const int step     = sub * K_QUANTS_PER_ITERATION;
                const int im       = step/8;
                const int in       = step%8;

                for (int i = blk_lane; i < nb_row; i += 2*K_QUANTS_PER_ITERATION) {
                    const uint8_t * q = x[i].qs + step;
                    const int8_t  * s = x[i].scales;
                    const float   * y = yy + i*QK_K + step;
                    const float     d = x[i].d;
                    float sum = 0.f;
                    for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
                        // A cleared bit in h subtracts 16 from the corresponding 4-bit value.
                        const uint8_t h = x[i].qh[in+j] >> im;
                        sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
                             + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
                             + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
                             + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
                    }
                    tmp += sum;
                }
            #endif

                // Combine the per-lane partial sums; afterwards every lane holds the row total.
                __syncthreads();
            #pragma unroll
                for (int offset = 16; offset > 0; offset >>= 1) {
                    tmp += __shfl_xor_sync(0xffffffff, tmp, offset, 32);
                }

                // A single lane publishes the result.
                if (threadIdx.x == 0) {
                    dst[row] = tmp;
                }
            }
         | 
| 1027 | 
            +
             | 
| 1028 | 
            +
            // Fused dequantize + matrix-vector product for a Q6_K-quantized matrix.
            //
            //   vx    : quantized matrix, read as an array of block_q6_K with ncols/QK_K blocks per row
            //   yy    : dense float vector that every row is dotted with
            //   dst   : output vector; dst[row] receives the dot product of row `row` with yy
            //   ncols : number of columns (NOTE(review): presumably a multiple of QK_K - confirm at call site)
            //   nrows : number of rows; threads whose row index is >= nrows exit without touching memory
            //
            // The row is selected by blockIdx.y/threadIdx.y and the lanes along threadIdx.x split the
            // work of one row. The final reduction is a 32-lane __shfl_xor_sync butterfly, so this
            // assumes blockDim.x == 32 (one full warp per row) - TODO confirm against the launcher.
            static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {

                static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

                const int row = blockIdx.y*blockDim.y + threadIdx.y;
                // Valid rows are 0...nrows-1. The previous test (row > nrows) let row == nrows through,
                // which reads past the end of the matrix and writes dst[nrows].
                if (row >= nrows) return;

                const int num_blocks_per_row = ncols / QK_K;
                const int ib0 = row*num_blocks_per_row;

                const block_q6_K * x = (const block_q6_K *)vx + ib0;

            #if QK_K == 256

                const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
                const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1

                const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8

                const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
                const int in = tid - step*im;                        // 0...15 or 0...7

            #if K_QUANTS_PER_ITERATION == 1
                const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
                const int is = 0;
            #else
                const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
                const int is = in / 4;
            #endif
                const int ql_offset = 64*im + l0;
                const int qh_offset = 32*im + l0;
                const int s_offset  =  8*im + is;
                const int y_offset = 128*im + l0;

                float tmp = 0; // partial sum for thread in warp

                // Threads with the same ix walk the row's blocks with stride K_QUANTS_PER_ITERATION.
                for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

                    const float   * y  = yy + i * QK_K + y_offset;
                    const uint8_t * ql = x[i].ql + ql_offset;
                    const uint8_t * qh = x[i].qh + qh_offset;
                    const int8_t  * s  = x[i].scales + s_offset;

                    const float d = x[i].d;

                    // Each 6-bit quant is a nibble of ql combined with two bits of qh, then
                    // shifted down by 32 before being scaled by s[k] * d.
            #if K_QUANTS_PER_ITERATION == 1
                    float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
                              + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
                              + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
                              + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
                              + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
                              + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
                              + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
                              +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
                    tmp += sum;
            #else
                    float sum = 0;
                    for (int l = 0; l < 4; ++l) {
                        sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
                             + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
                             + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
                             + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
                    }
                    tmp += sum;
            #endif

                }

            #else

                const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
                const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0,1 or 0...3

                const int step = tid * K_QUANTS_PER_ITERATION;

                float tmp = 0; // partial sum for thread in warp

                for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {

                    const float   * y  = yy + i * QK_K + step;
                    const uint8_t * ql = x[i].ql + step;
                    const uint8_t * qh = x[i].qh + step;
                    const int8_t  * s  = x[i].scales;

                    const float d = x[i].d;

                    float sum = 0;
                    for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
                        sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
                             + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
                             + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
                             + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
                    }
                    tmp += sum;

                }

            #endif

                // Sum up the partial sums of the warp and write back the result.
                // No __syncthreads() here: the kernel uses no shared memory, and rows that returned
                // early above would never reach a block-wide barrier. __shfl_xor_sync synchronizes the
                // participating lanes on its own.
            #pragma unroll
                for (int mask = 16; mask > 0; mask >>= 1) {
                    tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
                }

                // After the butterfly every lane holds the full sum; lane 0 alone stores it
                // (tid == 0 matched several lanes and issued redundant stores).
                if (threadIdx.x == 0) {
                    dst[row] = tmp;
                }
            }
         | 
| 1138 | 
            +
             | 
| 1139 | 
            +
            // "Dequantize" callback for plain fp16 data: fetches the two consecutive half values
            // at element index ib + iqs of vx and stores them in v.x and v.y.
            static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
                const half * src = (const half *) vx + (ib + iqs);

                // automatic half -> float type cast if dfloat == float
                v.x = src[0];
                v.y = src[1];
            }
         | 
| 642 1146 |  | 
| 643 1147 | 
             
            template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
         | 
| @@ -654,13 +1158,15 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k) | |
| 654 1158 | 
             
                const int y_offset = qr == 1 ? 1 : qk/2;
         | 
| 655 1159 |  | 
| 656 1160 | 
             
                // dequantize
         | 
| 657 | 
            -
                 | 
| 658 | 
            -
                 | 
| 659 | 
            -
             | 
| 1161 | 
            +
                dfloat2 v;
         | 
| 1162 | 
            +
                dequantize_kernel(vx, ib, iqs, v);
         | 
| 1163 | 
            +
             | 
| 1164 | 
            +
                y[iybs + iqs + 0]        = v.x;
         | 
| 1165 | 
            +
                y[iybs + iqs + y_offset] = v.y;
         | 
| 660 1166 | 
             
            }
         | 
| 661 1167 |  | 
| 662 1168 | 
             
            template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
         | 
| 663 | 
            -
            static __global__ void dequantize_mul_mat_vec(const void * vx, const  | 
| 1169 | 
            +
            static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
         | 
| 664 1170 | 
             
                // qk = quantized weights per x block
         | 
| 665 1171 | 
             
                // qr = number of quantized weights per data value in x block
         | 
| 666 1172 | 
             
                const int row = blockIdx.y*blockDim.y + threadIdx.y;
         | 
| @@ -675,7 +1181,12 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, | |
| 675 1181 | 
             
                const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
         | 
| 676 1182 | 
             
                const int y_offset = qr == 1 ? 1 : qk/2;
         | 
| 677 1183 |  | 
| 678 | 
            -
             | 
| 1184 | 
            +
            // partial sum for each thread
         | 
| 1185 | 
            +
            #ifdef GGML_CUDA_DMMV_F16
         | 
| 1186 | 
            +
                half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
         | 
| 1187 | 
            +
            #else
         | 
| 1188 | 
            +
                float tmp = 0.0f;
         | 
| 1189 | 
            +
            #endif // GGML_CUDA_DMMV_F16
         | 
| 679 1190 |  | 
| 680 1191 | 
             
                for (int i = 0; i < ncols; i += iter_stride) {
         | 
| 681 1192 | 
             
                    const int col = i + vals_per_iter*tid;
         | 
| @@ -689,14 +1200,21 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, | |
| 689 1200 | 
             
                        // process 2 vals per j iter
         | 
| 690 1201 |  | 
| 691 1202 | 
             
                        // dequantize
         | 
| 692 | 
            -
                        float v0, v1;
         | 
| 693 | 
            -
                        dequantize_kernel(vx, ib, iqs + j/qr, v0, v1);
         | 
| 694 1203 | 
             
                        // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
         | 
| 1204 | 
            +
                        dfloat2 v;
         | 
| 1205 | 
            +
                        dequantize_kernel(vx, ib, iqs + j/qr, v);
         | 
| 695 1206 |  | 
| 696 1207 | 
             
                        // matrix multiplication
         | 
| 697 | 
            -
                        tmp += v0 * y[iybs + iqs + j/qr + 0];
         | 
| 698 | 
            -
                        tmp += v1 * y[iybs + iqs + j/qr + y_offset];
         | 
| 699 1208 | 
             
                        // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
         | 
| 1209 | 
            +
            #ifdef GGML_CUDA_DMMV_F16
         | 
| 1210 | 
            +
                        tmp += __hmul2(v, {
         | 
| 1211 | 
            +
                            y[iybs + iqs + j/qr + 0],
         | 
| 1212 | 
            +
                            y[iybs + iqs + j/qr + y_offset]
         | 
| 1213 | 
            +
                        });
         | 
| 1214 | 
            +
            #else
         | 
| 1215 | 
            +
                        tmp += v.x * y[iybs + iqs + j/qr + 0];
         | 
| 1216 | 
            +
                        tmp += v.y * y[iybs + iqs + j/qr + y_offset];
         | 
| 1217 | 
            +
            #endif // GGML_CUDA_DMMV_F16
         | 
| 700 1218 | 
             
                    }
         | 
| 701 1219 | 
             
                }
         | 
| 702 1220 |  | 
| @@ -708,47 +1226,11 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, | |
| 708 1226 | 
             
                }
         | 
| 709 1227 |  | 
| 710 1228 | 
             
                if (tid == 0) {
         | 
| 1229 | 
            +
            #ifdef GGML_CUDA_DMMV_F16
         | 
| 1230 | 
            +
                    dst[row] = tmp.x + tmp.y;
         | 
| 1231 | 
            +
            #else
         | 
| 711 1232 | 
             
                    dst[row] = tmp;
         | 
| 712 | 
            -
             | 
| 713 | 
            -
            }
         | 
| 714 | 
            -
             | 
| 715 | 
            -
            template <int n_thread, dot_kernel_k_t dot_kernel>
         | 
| 716 | 
            -
            static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y, float * dst, const int ncols, const int nrows) {
         | 
| 717 | 
            -
                const int row = blockIdx.y*blockDim.y + threadIdx.y;
         | 
| 718 | 
            -
             | 
| 719 | 
            -
                if (row >= nrows) {
         | 
| 720 | 
            -
                    return;
         | 
| 721 | 
            -
                }
         | 
| 722 | 
            -
             | 
| 723 | 
            -
                const int tid = threadIdx.x;
         | 
| 724 | 
            -
             | 
| 725 | 
            -
                const int iter_stride = QK_K;
         | 
| 726 | 
            -
                const int vals_per_iter = iter_stride / n_thread;
         | 
| 727 | 
            -
                const int num_blocks_per_row = ncols / QK_K;
         | 
| 728 | 
            -
                const int ib0 = row*num_blocks_per_row;
         | 
| 729 | 
            -
             | 
| 730 | 
            -
                float tmp = 0; // partial sum for thread in warp
         | 
| 731 | 
            -
             | 
| 732 | 
            -
                for (int i = 0; i < ncols; i += iter_stride) {
         | 
| 733 | 
            -
                    const int col = i + vals_per_iter*tid;
         | 
| 734 | 
            -
                    const int ib = ib0 + col/QK_K; // x block index
         | 
| 735 | 
            -
                    const int iqs = col%QK_K; // x quant index
         | 
| 736 | 
            -
                    const int iybs = col - col%QK_K; // y block start index
         | 
| 737 | 
            -
             | 
| 738 | 
            -
                    float v;
         | 
| 739 | 
            -
                    dot_kernel(vx, ib, iqs, y + iybs, v);
         | 
| 740 | 
            -
                    tmp += v;
         | 
| 741 | 
            -
                }
         | 
| 742 | 
            -
             | 
| 743 | 
            -
                // sum up partial sums and write back result
         | 
| 744 | 
            -
                __syncthreads();
         | 
| 745 | 
            -
            #pragma unroll
         | 
| 746 | 
            -
                for (int mask = 16; mask > 0; mask >>= 1) {
         | 
| 747 | 
            -
                    tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
         | 
| 748 | 
            -
                }
         | 
| 749 | 
            -
             | 
| 750 | 
            -
                if (tid == 0) {
         | 
| 751 | 
            -
                    dst[row] = tmp;
         | 
| 1233 | 
            +
            #endif // GGML_CUDA_DMMV_F16
         | 
| 752 1234 | 
             
                }
         | 
| 753 1235 | 
             
            }
         | 
| 754 1236 |  | 
| @@ -1020,12 +1502,20 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu | |
| 1020 1502 |  | 
| 1021 1503 | 
             
            static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
         | 
| 1022 1504 | 
             
                const int nb = k / QK_K;
         | 
| 1505 | 
            +
            #if QK_K == 256
         | 
| 1023 1506 | 
             
                dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
         | 
| 1507 | 
            +
            #else
         | 
| 1508 | 
            +
                dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
         | 
| 1509 | 
            +
            #endif
         | 
| 1024 1510 | 
             
            }
         | 
| 1025 1511 |  | 
| 1026 1512 | 
             
            static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
         | 
| 1027 1513 | 
             
                const int nb = k / QK_K;
         | 
| 1514 | 
            +
            #if QK_K == 256
         | 
| 1028 1515 | 
             
                dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
         | 
| 1516 | 
            +
            #else
         | 
| 1517 | 
            +
                dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
         | 
| 1518 | 
            +
            #endif
         | 
| 1029 1519 | 
             
            }
         | 
| 1030 1520 |  | 
| 1031 1521 | 
             
            static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
         | 
| @@ -1035,15 +1525,23 @@ static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cu | |
| 1035 1525 |  | 
| 1036 1526 | 
             
            static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
         | 
| 1037 1527 | 
             
                const int nb = k / QK_K;
         | 
| 1528 | 
            +
            #if QK_K == 256
         | 
| 1038 1529 | 
             
                dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
         | 
| 1530 | 
            +
            #else
         | 
| 1531 | 
            +
                dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
         | 
| 1532 | 
            +
            #endif
         | 
| 1039 1533 | 
             
            }
         | 
| 1040 1534 |  | 
| 1041 1535 | 
             
            static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
         | 
| 1042 1536 | 
             
                const int nb = k / QK_K;
         | 
| 1537 | 
            +
            #if QK_K == 256
         | 
| 1043 1538 | 
             
                dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
         | 
| 1539 | 
            +
            #else
         | 
| 1540 | 
            +
                dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
         | 
| 1541 | 
            +
            #endif
         | 
| 1044 1542 | 
             
            }
         | 
| 1045 1543 |  | 
| 1046 | 
            -
            static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const  | 
| 1544 | 
            +
            static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
         | 
| 1047 1545 | 
             
                GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
         | 
| 1048 1546 | 
             
                const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
         | 
| 1049 1547 | 
             
                const dim3 block_nums(1, block_num_y, 1);
         | 
| @@ -1052,7 +1550,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, f | |
| 1052 1550 | 
             
                    <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
         | 
| 1053 1551 | 
             
            }
         | 
| 1054 1552 |  | 
| 1055 | 
            -
            static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const  | 
| 1553 | 
            +
            static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
         | 
| 1056 1554 | 
             
                GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
         | 
| 1057 1555 | 
             
                const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
         | 
| 1058 1556 | 
             
                const dim3 block_nums(1, block_num_y, 1);
         | 
| @@ -1061,7 +1559,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, f | |
| 1061 1559 | 
             
                    <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
         | 
| 1062 1560 | 
             
            }
         | 
| 1063 1561 |  | 
| 1064 | 
            -
            static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const  | 
| 1562 | 
            +
            static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
         | 
| 1065 1563 | 
             
                GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
         | 
| 1066 1564 | 
             
                const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
         | 
| 1067 1565 | 
             
                const dim3 block_nums(1, block_num_y, 1);
         | 
| @@ -1070,7 +1568,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, f | |
| 1070 1568 | 
             
                    <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
         | 
| 1071 1569 | 
             
            }
         | 
| 1072 1570 |  | 
| 1073 | 
            -
            static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const  | 
| 1571 | 
            +
            static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
         | 
| 1074 1572 | 
             
                GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
         | 
| 1075 1573 | 
             
                const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
         | 
| 1076 1574 | 
             
                const dim3 block_nums(1, block_num_y, 1);
         | 
| @@ -1079,7 +1577,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, f | |
| 1079 1577 | 
             
                    <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
         | 
| 1080 1578 | 
             
            }
         | 
| 1081 1579 |  | 
| 1082 | 
            -
            static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const  | 
| 1580 | 
            +
            static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
         | 
| 1083 1581 | 
             
                GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
         | 
| 1084 1582 | 
             
                const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
         | 
| 1085 1583 | 
             
                const dim3 block_nums(1, block_num_y, 1);
         | 
| @@ -1090,47 +1588,44 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f | |
| 1090 1588 |  | 
| 1091 1589 | 
             
            static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
         | 
| 1092 1590 | 
             
                GGML_ASSERT(ncols % QK_K == 0);
         | 
| 1093 | 
            -
                const int ny = 2;
         | 
| 1591 | 
            +
                const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
         | 
| 1094 1592 | 
             
                const int block_num_y = (nrows + ny - 1) / ny;
         | 
| 1095 1593 | 
             
                const dim3 block_nums(1, block_num_y, 1);
         | 
| 1096 1594 | 
             
                const dim3 block_dims(32, ny, 1);
         | 
| 1097 | 
            -
                 | 
| 1595 | 
            +
                dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
         | 
| 1098 1596 | 
             
            }
         | 
| 1099 1597 |  | 
| 1100 1598 | 
             
            static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
         | 
| 1101 1599 | 
             
                GGML_ASSERT(ncols % QK_K == 0);
         | 
| 1102 | 
            -
                const int ny = 2;
         | 
| 1600 | 
            +
                const int ny = 2 / K_QUANTS_PER_ITERATION;
         | 
| 1103 1601 | 
             
                const int block_num_y = (nrows + ny - 1) / ny;
         | 
| 1104 1602 | 
             
                const dim3 block_nums(1, block_num_y, 1);
         | 
| 1105 1603 | 
             
                const dim3 block_dims(32, ny, 1);
         | 
| 1106 | 
            -
                 | 
| 1604 | 
            +
                dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
         | 
| 1107 1605 | 
             
            }
         | 
| 1108 1606 |  | 
| 1109 1607 | 
             
            static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
         | 
| 1110 1608 | 
             
                GGML_ASSERT(ncols % QK_K == 0);
         | 
| 1111 | 
            -
                const int ny = 2;
         | 
| 1609 | 
            +
                const int ny = 2 / K_QUANTS_PER_ITERATION;
         | 
| 1112 1610 | 
             
                const int block_num_y = (nrows + ny - 1) / ny;
         | 
| 1113 1611 | 
             
                const dim3 block_nums(1, block_num_y, 1);
         | 
| 1114 1612 | 
             
                const dim3 block_dims(32, ny, 1);
         | 
| 1115 | 
            -
                 | 
| 1613 | 
            +
                dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
         | 
| 1116 1614 | 
             
            }
         | 
| 1117 1615 |  | 
| 1118 1616 | 
             
            static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
         | 
| 1119 1617 | 
             
                GGML_ASSERT(ncols % QK_K == 0);
         | 
| 1120 | 
            -
                const  | 
| 1121 | 
            -
                 | 
| 1122 | 
            -
                const dim3 block_nums(1, block_num_y, 1);
         | 
| 1123 | 
            -
                const dim3 block_dims(32, ny, 1);
         | 
| 1124 | 
            -
                dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
         | 
| 1618 | 
            +
                const dim3 block_dims(32, 1, 1);
         | 
| 1619 | 
            +
                dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
         | 
| 1125 1620 | 
             
            }
         | 
| 1126 1621 |  | 
| 1127 1622 | 
             
            static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
         | 
| 1128 1623 | 
             
                GGML_ASSERT(ncols % QK_K == 0);
         | 
| 1129 | 
            -
                const int ny = 2;
         | 
| 1624 | 
            +
                const int ny = 2 / K_QUANTS_PER_ITERATION;
         | 
| 1130 1625 | 
             
                const int block_num_y = (nrows + ny - 1) / ny;
         | 
| 1131 1626 | 
             
                const dim3 block_nums(1, block_num_y, 1);
         | 
| 1132 1627 | 
             
                const dim3 block_dims(32, ny, 1);
         | 
| 1133 | 
            -
                 | 
| 1628 | 
            +
                dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
         | 
| 1134 1629 | 
             
            }
         | 
| 1135 1630 |  | 
| 1136 1631 | 
             
            static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
         | 
| @@ -1138,7 +1633,7 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c | |
| 1138 1633 | 
             
                dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
         | 
| 1139 1634 | 
             
            }
         | 
| 1140 1635 |  | 
| 1141 | 
            -
            static void convert_mul_mat_vec_f16_cuda(const void * vx, const  | 
| 1636 | 
            +
            static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
         | 
| 1142 1637 | 
             
                GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
         | 
| 1143 1638 | 
             
                const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
         | 
| 1144 1639 | 
             
                const dim3 block_nums(1, block_num_y, 1);
         | 
| @@ -1306,19 +1801,13 @@ static void * g_scratch_buffer = nullptr; | |
| 1306 1801 | 
             
            static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
         | 
| 1307 1802 | 
             
            static size_t g_scratch_offset = 0;
         | 
| 1308 1803 |  | 
| 1309 | 
            -
            #define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
         | 
| 1310 | 
            -
            #define GGML_CUDA_MAX_EVENTS 64
         | 
| 1311 | 
            -
             | 
| 1312 1804 | 
             
            static int g_device_count = -1;
         | 
| 1313 1805 | 
             
            static int g_main_device = 0;
         | 
| 1314 1806 | 
             
            static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
         | 
| 1315 1807 |  | 
| 1316 1808 | 
             
            static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
         | 
| 1317 1809 |  | 
| 1318 | 
            -
            static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] | 
| 1319 | 
            -
             | 
| 1320 | 
            -
            static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
         | 
| 1321 | 
            -
            static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
         | 
| 1810 | 
            +
            static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
         | 
| 1322 1811 |  | 
| 1323 1812 | 
             
            void ggml_init_cublas() {
         | 
| 1324 1813 | 
             
                static bool initialized = false;
         | 
| @@ -1342,15 +1831,8 @@ void ggml_init_cublas() { | |
| 1342 1831 | 
             
                    for (int id = 0; id < g_device_count; ++id) {
         | 
| 1343 1832 | 
             
                        CUDA_CHECK(cudaSetDevice(id));
         | 
| 1344 1833 |  | 
| 1345 | 
            -
                        // create  | 
| 1346 | 
            -
                         | 
| 1347 | 
            -
                            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
         | 
| 1348 | 
            -
                            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
         | 
| 1349 | 
            -
                        }
         | 
| 1350 | 
            -
                        // create events
         | 
| 1351 | 
            -
                        for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
         | 
| 1352 | 
            -
                            CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
         | 
| 1353 | 
            -
                        }
         | 
| 1834 | 
            +
                        // create main stream
         | 
| 1835 | 
            +
                        CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
         | 
| 1354 1836 |  | 
| 1355 1837 | 
             
                        // create cublas handle
         | 
| 1356 1838 | 
             
                        CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
         | 
| @@ -1566,21 +2048,40 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec( | |
| 1566 2048 | 
             
                const int64_t ne00 = src0->ne[0];
         | 
| 1567 2049 | 
             
                const int64_t nrows = i01_high - i01_low;
         | 
| 1568 2050 |  | 
| 2051 | 
            +
            // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
         | 
| 2052 | 
            +
            #ifdef GGML_CUDA_DMMV_F16
         | 
| 2053 | 
            +
                size_t ash;
         | 
| 2054 | 
            +
                dfloat * src1_dfloat = nullptr; // dfloat == half
         | 
| 2055 | 
            +
             | 
| 2056 | 
            +
                bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
         | 
| 2057 | 
            +
                    src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
         | 
| 2058 | 
            +
                    src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
         | 
| 2059 | 
            +
             | 
| 2060 | 
            +
                if (src1_convert_f16) {
         | 
| 2061 | 
            +
                    src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
         | 
| 2062 | 
            +
                    ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
         | 
| 2063 | 
            +
                                            ne00, 1, sizeof(float), 0, 0,
         | 
| 2064 | 
            +
                                            ne00, 1, sizeof(half),  0, 0, cudaStream_main);
         | 
| 2065 | 
            +
                }
         | 
| 2066 | 
            +
            #else
         | 
| 2067 | 
            +
                dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
         | 
| 2068 | 
            +
            #endif // GGML_CUDA_DMMV_F16
         | 
| 2069 | 
            +
             | 
| 1569 2070 | 
             
                switch (src0->type) {
         | 
| 1570 2071 | 
             
                    case GGML_TYPE_Q4_0:
         | 
| 1571 | 
            -
                        dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i,  | 
| 2072 | 
            +
                        dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
         | 
| 1572 2073 | 
             
                        break;
         | 
| 1573 2074 | 
             
                    case GGML_TYPE_Q4_1:
         | 
| 1574 | 
            -
                        dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i,  | 
| 2075 | 
            +
                        dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
         | 
| 1575 2076 | 
             
                        break;
         | 
| 1576 2077 | 
             
                    case GGML_TYPE_Q5_0:
         | 
| 1577 | 
            -
                        dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i,  | 
| 2078 | 
            +
                        dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
         | 
| 1578 2079 | 
             
                        break;
         | 
| 1579 2080 | 
             
                    case GGML_TYPE_Q5_1:
         | 
| 1580 | 
            -
                        dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i,  | 
| 2081 | 
            +
                        dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
         | 
| 1581 2082 | 
             
                        break;
         | 
| 1582 2083 | 
             
                    case GGML_TYPE_Q8_0:
         | 
| 1583 | 
            -
                        dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i,  | 
| 2084 | 
            +
                        dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
         | 
| 1584 2085 | 
             
                        break;
         | 
| 1585 2086 | 
             
                    case GGML_TYPE_Q2_K:
         | 
| 1586 2087 | 
             
                        dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
         | 
| @@ -1598,7 +2099,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec( | |
| 1598 2099 | 
             
                        dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
         | 
| 1599 2100 | 
             
                        break;
         | 
| 1600 2101 | 
             
                    case GGML_TYPE_F16:
         | 
| 1601 | 
            -
                        convert_mul_mat_vec_f16_cuda(src0_ddq_i,  | 
| 2102 | 
            +
                        convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
         | 
| 1602 2103 | 
             
                        break;
         | 
| 1603 2104 | 
             
                    default:
         | 
| 1604 2105 | 
             
                        GGML_ASSERT(false);
         | 
| @@ -1606,6 +2107,12 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec( | |
| 1606 2107 | 
             
                }
         | 
| 1607 2108 | 
             
                CUDA_CHECK(cudaGetLastError());
         | 
| 1608 2109 |  | 
| 2110 | 
            +
            #ifdef GGML_CUDA_DMMV_F16
         | 
| 2111 | 
            +
                if (src1_convert_f16) {
         | 
| 2112 | 
            +
                    ggml_cuda_pool_free(src1_dfloat, ash);
         | 
| 2113 | 
            +
                }
         | 
| 2114 | 
            +
            #endif // GGML_CUDA_DMMV_F16
         | 
| 2115 | 
            +
             | 
| 1609 2116 | 
             
                (void) src1;
         | 
| 1610 2117 | 
             
                (void) dst;
         | 
| 1611 2118 | 
             
                (void) src0_ddf_i;
         | 
| @@ -1817,6 +2324,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm | |
| 1817 2324 | 
             
                size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
         | 
| 1818 2325 | 
             
                size_t  dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
         | 
| 1819 2326 |  | 
| 2327 | 
            +
                // if multiple GPUs are used they need to wait for the main GPU to finish
         | 
| 2328 | 
            +
                if (split && g_device_count > 1) {
         | 
| 2329 | 
            +
                    CUDA_CHECK(cudaSetDevice(g_main_device));
         | 
| 2330 | 
            +
                    CUDA_CHECK(cudaDeviceSynchronize());
         | 
| 2331 | 
            +
                }
         | 
| 2332 | 
            +
             | 
| 1820 2333 | 
             
                for (int id = 0; id < g_device_count; ++id) {
         | 
| 1821 2334 | 
             
                    if (!split && id != g_main_device) {
         | 
| 1822 2335 | 
             
                        continue;
         | 
| @@ -1915,9 +2428,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm | |
| 1915 2428 | 
             
                            }
         | 
| 1916 2429 | 
             
                            const int64_t i11 = i13*ne12 + i12;
         | 
| 1917 2430 |  | 
| 1918 | 
            -
                            cudaStream_t cudaStream_main | 
| 1919 | 
            -
                            cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
         | 
| 1920 | 
            -
                            cudaEvent_t  cudaEvent_memcpy_src1  =  g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
         | 
| 2431 | 
            +
                            cudaStream_t cudaStream_main = g_cudaStreams_main[id];
         | 
| 1921 2432 |  | 
| 1922 2433 | 
             
                            // for split tensors the data begins at i0 == i0_offset_low
         | 
| 1923 2434 | 
             
                            char  * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
         | 
| @@ -1945,14 +2456,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm | |
| 1945 2456 | 
             
                                if (src1->backend == GGML_BACKEND_CPU) {
         | 
| 1946 2457 | 
             
                                    GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
         | 
| 1947 2458 | 
             
                                    int64_t nrows1 = flatten_rows ? nrows0 : ne11;
         | 
| 1948 | 
            -
                                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1,  | 
| 2459 | 
            +
                                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
         | 
| 1949 2460 | 
             
                                } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
         | 
| 1950 2461 | 
             
                                    if (id != g_main_device) {
         | 
| 1951 2462 | 
             
                                        GGML_ASSERT(!flatten_rows);
         | 
| 1952 2463 | 
             
                                        float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
         | 
| 1953 2464 | 
             
                                        src1_ddf_i_source += i11*src1_stride;
         | 
| 1954 2465 | 
             
                                        CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
         | 
| 1955 | 
            -
                                                                cudaMemcpyDeviceToDevice,  | 
| 2466 | 
            +
                                                                cudaMemcpyDeviceToDevice, cudaStream_main));
         | 
| 1956 2467 | 
             
                                    }
         | 
| 1957 2468 | 
             
                                } else if (src1_on_device && !src1_is_contiguous) {
         | 
| 1958 2469 | 
             
                                    GGML_ASSERT(!split);
         | 
| @@ -1961,7 +2472,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm | |
| 1961 2472 | 
             
                                    GGML_ASSERT(false);
         | 
| 1962 2473 | 
             
                                }
         | 
| 1963 2474 | 
             
                            }
         | 
| 1964 | 
            -
                            CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
         | 
| 1965 2475 |  | 
| 1966 2476 | 
             
                            if (!src0_on_device || !src0_is_contiguous) {
         | 
| 1967 2477 | 
             
                                if (src0_is_f32) {
         | 
| @@ -1977,9 +2487,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm | |
| 1977 2487 | 
             
                                CUDA_CHECK(cudaGetLastError());
         | 
| 1978 2488 | 
             
                            }
         | 
| 1979 2489 |  | 
| 1980 | 
            -
                            // wait with main stream until src1 memcpy is done
         | 
| 1981 | 
            -
                            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));
         | 
| 1982 | 
            -
             | 
| 1983 2490 | 
             
                            // do the computation
         | 
| 1984 2491 | 
             
                            op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
         | 
| 1985 2492 |  | 
| @@ -2017,8 +2524,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm | |
| 2017 2524 |  | 
| 2018 2525 | 
             
                // wait until each device is finished, then free their buffers
         | 
| 2019 2526 | 
             
                for (int id = 0; id < g_device_count; ++id) {
         | 
| 2527 | 
            +
                    if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
         | 
| 2528 | 
            +
                        continue;
         | 
| 2529 | 
            +
                    }
         | 
| 2530 | 
            +
             | 
| 2020 2531 | 
             
                    CUDA_CHECK(cudaSetDevice(id));
         | 
| 2021 2532 | 
             
                    CUDA_CHECK(cudaDeviceSynchronize());
         | 
| 2533 | 
            +
             | 
| 2022 2534 | 
             
                    if (src0_asq[id] > 0) {
         | 
| 2023 2535 | 
             
                        ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
         | 
| 2024 2536 | 
             
                    }
         | 
| @@ -2084,7 +2596,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr | |
| 2084 2596 | 
             
                const int64_t ne02 = src0->ne[2];
         | 
| 2085 2597 |  | 
| 2086 2598 | 
             
                CUDA_CHECK(cudaSetDevice(g_main_device));
         | 
| 2087 | 
            -
                cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
         | 
| 2599 | 
            +
                cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
         | 
| 2088 2600 |  | 
| 2089 2601 | 
             
                struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
         | 
| 2090 2602 | 
             
                void * src0_ddq = src0_extra->data_device[g_main_device];
         | 
| @@ -2096,8 +2608,6 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr | |
| 2096 2608 | 
             
                float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
         | 
| 2097 2609 |  | 
| 2098 2610 | 
             
                ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
         | 
| 2099 | 
            -
             | 
| 2100 | 
            -
                CUDA_CHECK(cudaDeviceSynchronize());
         | 
| 2101 2611 | 
             
            }
         | 
| 2102 2612 |  | 
| 2103 2613 | 
             
            void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
         | 
| @@ -2115,7 +2625,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1 | |
| 2115 2625 | 
             
                const int64_t nb02 = src0->nb[2];
         | 
| 2116 2626 |  | 
| 2117 2627 | 
             
                CUDA_CHECK(cudaSetDevice(g_main_device));
         | 
| 2118 | 
            -
                cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
         | 
| 2628 | 
            +
                cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
         | 
| 2119 2629 |  | 
| 2120 2630 | 
             
                struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
         | 
| 2121 2631 | 
             
                void * src0_ddq = src0_extra->data_device[g_main_device];
         | 
| @@ -2130,8 +2640,6 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1 | |
| 2130 2640 | 
             
                const int channel_stride_x = nb02 / sizeof(half);
         | 
| 2131 2641 |  | 
| 2132 2642 | 
             
                ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
         | 
| 2133 | 
            -
             | 
| 2134 | 
            -
                CUDA_CHECK(cudaDeviceSynchronize());
         | 
| 2135 2643 | 
             
            }
         | 
| 2136 2644 |  | 
| 2137 2645 | 
             
            void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
         | 
| @@ -2187,7 +2695,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens | |
| 2187 2695 | 
             
                const int64_t nb12 = src1->nb[2];
         | 
| 2188 2696 |  | 
| 2189 2697 | 
             
                CUDA_CHECK(cudaSetDevice(g_main_device));
         | 
| 2190 | 
            -
                cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
         | 
| 2698 | 
            +
                cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
         | 
| 2191 2699 |  | 
| 2192 2700 | 
             
                const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
         | 
| 2193 2701 | 
             
                const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
         | 
| @@ -2205,8 +2713,6 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens | |
| 2205 2713 | 
             
                    GGML_ASSERT(false);
         | 
| 2206 2714 | 
             
                }
         | 
| 2207 2715 |  | 
| 2208 | 
            -
                CUDA_CHECK(cudaDeviceSynchronize());
         | 
| 2209 | 
            -
             | 
| 2210 2716 | 
             
                (void) dst;
         | 
| 2211 2717 | 
             
            }
         | 
| 2212 2718 |  | 
| @@ -2313,6 +2819,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) { | |
| 2313 2819 |  | 
| 2314 2820 | 
             
                tensor->backend = GGML_BACKEND_GPU;
         | 
| 2315 2821 | 
             
                struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
         | 
| 2822 | 
            +
                memset(extra, 0, sizeof(*extra));
         | 
| 2316 2823 |  | 
| 2317 2824 | 
             
                const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
         | 
| 2318 2825 | 
             
                    tensor->op == GGML_OP_VIEW;
         | 
| @@ -2395,7 +2902,7 @@ void ggml_cuda_free_scratch() { | |
| 2395 2902 | 
             
            bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
         | 
| 2396 2903 | 
             
                ggml_cuda_func_t func;
         | 
| 2397 2904 | 
             
                const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
         | 
| 2398 | 
            -
                    || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT
         | 
| 2905 | 
            +
                    || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT))
         | 
| 2399 2906 | 
             
                    || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
         | 
| 2400 2907 |  | 
| 2401 2908 | 
             
                switch (tensor->op) {
         |