llama_cpp 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +32 -0
- data/README.md +39 -6
- data/examples/README.md +32 -0
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +38 -0
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +231 -132
- data/ext/llama_cpp/src/ggml-cuda.cu +844 -337
- data/ext/llama_cpp/src/ggml-metal.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.m +193 -49
- data/ext/llama_cpp/src/ggml-metal.metal +477 -84
- data/ext/llama_cpp/src/ggml-opencl.cpp +493 -4
- data/ext/llama_cpp/src/ggml.c +1565 -430
- data/ext/llama_cpp/src/ggml.h +208 -14
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +194 -101
- data/ext/llama_cpp/src/llama.h +41 -14
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +12 -17
- metadata +3 -3
- data/lib/llama_cpp/client.rb +0 -172
| @@ -7,7 +7,13 @@ | |
| 7 7 | 
             
            #include <stddef.h>
         | 
| 8 8 |  | 
| 9 9 | 
             
            // Super-block size
         | 
| 10 | 
            +
            #ifdef GGML_QKK_64
         | 
| 11 | 
            +
            #define QK_K 64
         | 
| 12 | 
            +
            #define K_SCALE_SIZE 4
         | 
| 13 | 
            +
            #else
         | 
| 10 14 | 
             
            #define QK_K 256
         | 
| 15 | 
            +
            #define K_SCALE_SIZE 12
         | 
| 16 | 
            +
            #endif
         | 
| 11 17 |  | 
| 12 18 | 
             
            //
         | 
| 13 19 | 
             
            // Super-block quantization structures
         | 
| @@ -29,38 +35,67 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w | |
| 29 35 | 
             
            // weight is represented as x = a * q
         | 
| 30 36 | 
             
            // 16 blocks of 16 elements each
         | 
| 31 37 | 
             
            // Effectively 3.4375 bits per weight
         | 
| 38 | 
            +
            #ifdef GGML_QKK_64
         | 
| 32 39 | 
             
            typedef struct {
         | 
| 33 40 | 
             
                uint8_t hmask[QK_K/8];     // quants - high bit
         | 
| 34 41 | 
             
                uint8_t qs[QK_K/4];        // quants - low 2 bits
         | 
| 35 | 
            -
                uint8_t scales[ | 
| 42 | 
            +
                uint8_t scales[2];
         | 
| 36 43 | 
             
                ggml_fp16_t d;             // super-block scale
         | 
| 37 44 | 
             
            } block_q3_K;
         | 
| 38 | 
            -
            static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 +  | 
| 45 | 
            +
            static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
         | 
| 46 | 
            +
            #else
         | 
| 47 | 
            +
            typedef struct {
         | 
| 48 | 
            +
                uint8_t hmask[QK_K/8];     // quants - high bit
         | 
| 49 | 
            +
                uint8_t qs[QK_K/4];        // quants - low 2 bits
         | 
| 50 | 
            +
                uint8_t scales[12];        // scales, quantized with 6 bits
         | 
| 51 | 
            +
                ggml_fp16_t d;             // super-block scale
         | 
| 52 | 
            +
            } block_q3_K;
         | 
| 53 | 
            +
            static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
         | 
| 54 | 
            +
            #endif
         | 
| 39 55 |  | 
| 40 56 | 
             
            // 4-bit quantization
         | 
| 41 57 | 
             
            // 16 blocks of 32 elements each
         | 
| 42 58 | 
             
            // weight is represented as x = a * q + b
         | 
| 43 59 | 
             
            // Effectively 4.5 bits per weight
         | 
| 60 | 
            +
            #ifdef GGML_QKK_64
         | 
| 61 | 
            +
            typedef struct {
         | 
| 62 | 
            +
                ggml_fp16_t d[2];          // super-block scales/mins
         | 
| 63 | 
            +
                uint8_t scales[2];         // 4-bit block scales/mins
         | 
| 64 | 
            +
                uint8_t qs[QK_K/2];        // 4-bit quants
         | 
| 65 | 
            +
            } block_q4_K;
         | 
| 66 | 
            +
            static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
         | 
| 67 | 
            +
            #else
         | 
| 44 68 | 
             
            typedef struct {
         | 
| 45 69 | 
             
                ggml_fp16_t d;             // super-block scale for quantized scales
         | 
| 46 70 | 
             
                ggml_fp16_t dmin;          // super-block scale for quantized mins
         | 
| 47 | 
            -
                uint8_t scales[ | 
| 71 | 
            +
                uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
         | 
| 48 72 | 
             
                uint8_t qs[QK_K/2];        // 4-bit quants
         | 
| 49 73 | 
             
            } block_q4_K;
         | 
| 50 | 
            -
            static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) +  | 
| 74 | 
            +
            static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
         | 
| 75 | 
            +
            #endif
         | 
| 51 76 |  | 
| 52 77 | 
             
            // 5-bit quantization
         | 
| 53 78 | 
             
            // 16 blocks of 32 elements each
         | 
| 54 79 | 
             
            // weight is represented as x = a * q + b
         | 
| 55 80 | 
             
            // Effectively 5.5 bits per weight
         | 
| 81 | 
            +
            #ifdef GGML_QKK_64
         | 
| 82 | 
            +
            typedef struct {
         | 
| 83 | 
            +
                ggml_fp16_t d;               // super-block scale
         | 
| 84 | 
            +
                int8_t  scales[QK_K/16];     // 8-bit block scales
         | 
| 85 | 
            +
                uint8_t qh[QK_K/8];          // quants, high bit
         | 
| 86 | 
            +
                uint8_t qs[QK_K/2];          // quants, low 4 bits
         | 
| 87 | 
            +
            } block_q5_K;
         | 
| 88 | 
            +
            static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
         | 
| 89 | 
            +
            #else
         | 
| 56 90 | 
             
            typedef struct {
         | 
| 57 91 | 
             
                ggml_fp16_t d;               // super-block scale for quantized scales
         | 
| 58 92 | 
             
                ggml_fp16_t dmin;            // super-block scale for quantized mins
         | 
| 59 | 
            -
                uint8_t scales[ | 
| 93 | 
            +
                uint8_t scales[K_SCALE_SIZE];   // scales and mins, quantized with 6 bits
         | 
| 60 94 | 
             
                uint8_t qh[QK_K/8];          // quants, high bit
         | 
| 61 95 | 
             
                uint8_t qs[QK_K/2];          // quants, low 4 bits
         | 
| 62 96 | 
             
            } block_q5_K;
         | 
| 63 | 
            -
            static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) +  | 
| 97 | 
            +
            static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
         | 
| 98 | 
            +
            #endif
         | 
| 64 99 |  | 
| 65 100 | 
             
            // 6-bit quantization
         | 
| 66 101 | 
             
            // weight is represented as x = a * q
         | 
| @@ -172,12 +172,14 @@ struct llama_mmap { | |
| 172 172 | 
             
            #ifdef _POSIX_MAPPED_FILES
         | 
| 173 173 | 
             
                static constexpr bool SUPPORTED = true;
         | 
| 174 174 |  | 
| 175 | 
            -
                llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
         | 
| 175 | 
            +
                llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
         | 
| 176 176 | 
             
                    size = file->size;
         | 
| 177 177 | 
             
                    int fd = fileno(file->fp);
         | 
| 178 178 | 
             
                    int flags = MAP_SHARED;
         | 
| 179 | 
            +
                    // prefetch/readahead impairs performance on NUMA systems
         | 
| 180 | 
            +
                    if (numa) { prefetch = 0; }
         | 
| 179 181 | 
             
            #ifdef __linux__
         | 
| 180 | 
            -
                    flags |= MAP_POPULATE;
         | 
| 182 | 
            +
                    if (prefetch) { flags |= MAP_POPULATE; }
         | 
| 181 183 | 
             
            #endif
         | 
| 182 184 | 
             
                    addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         | 
| 183 185 | 
             
                    if (addr == MAP_FAILED) {
         | 
| @@ -191,6 +193,14 @@ struct llama_mmap { | |
| 191 193 | 
             
                                    strerror(errno));
         | 
| 192 194 | 
             
                        }
         | 
| 193 195 | 
             
                    }
         | 
| 196 | 
            +
                    if (numa) {
         | 
| 197 | 
            +
                        // advise the kernel not to use readahead
         | 
| 198 | 
            +
                        // (because the next page might not belong on the same node)
         | 
| 199 | 
            +
                        if (madvise(addr, file->size, MADV_RANDOM)) {
         | 
| 200 | 
            +
                            fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
         | 
| 201 | 
            +
                                    strerror(errno));
         | 
| 202 | 
            +
                        }
         | 
| 203 | 
            +
                    }
         | 
| 194 204 | 
             
                }
         | 
| 195 205 |  | 
| 196 206 | 
             
                ~llama_mmap() {
         | 
| @@ -199,7 +209,9 @@ struct llama_mmap { | |
| 199 209 | 
             
            #elif defined(_WIN32)
         | 
| 200 210 | 
             
                static constexpr bool SUPPORTED = true;
         | 
| 201 211 |  | 
| 202 | 
            -
                llama_mmap(struct llama_file * file, bool prefetch = true) {
         | 
| 212 | 
            +
                llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
         | 
| 213 | 
            +
                    (void) numa;
         | 
| 214 | 
            +
             | 
| 203 215 | 
             
                    size = file->size;
         | 
| 204 216 |  | 
| 205 217 | 
             
                    HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
         | 
| @@ -244,8 +256,10 @@ struct llama_mmap { | |
| 244 256 | 
             
            #else
         | 
| 245 257 | 
             
                static constexpr bool SUPPORTED = false;
         | 
| 246 258 |  | 
| 247 | 
            -
                llama_mmap(struct llama_file *, bool prefetch = true) {
         | 
| 248 | 
            -
                    (void)prefetch;
         | 
| 259 | 
            +
                llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
         | 
| 260 | 
            +
                    (void) prefetch;
         | 
| 261 | 
            +
                    (void) numa;
         | 
| 262 | 
            +
             | 
| 249 263 | 
             
                    throw std::runtime_error(std::string("mmap not supported"));
         | 
| 250 264 | 
             
                }
         | 
| 251 265 | 
             
            #endif
         |