llama_cpp 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -7
- data/ext/llama_cpp/extconf.rb +1 -2
- data/ext/llama_cpp/src/ggml-opencl.cpp +1028 -0
- data/ext/llama_cpp/src/ggml-opencl.h +8 -10
- data/ext/llama_cpp/src/ggml.c +568 -57
- data/ext/llama_cpp/src/ggml.h +21 -2
- data/ext/llama_cpp/src/llama.cpp +37 -2
- data/ext/llama_cpp/src/llama.h +5 -0
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -3
- data/ext/llama_cpp/src/ggml-opencl.c +0 -474
    
data/ext/llama_cpp/src/ggml.h
CHANGED

@@ -198,6 +198,7 @@
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_OPT           4
+#define GGML_MAX_NAME          32
 #define GGML_DEFAULT_N_THREADS 4
 
 #define GGML_ASSERT(x) \
@@ -249,6 +250,7 @@ extern "C" {
     enum ggml_backend {
         GGML_BACKEND_CPU = 0,
         GGML_BACKEND_CUDA = 1,
+        GGML_BACKEND_CL = 2,
     };
 
     // model file types
@@ -371,11 +373,13 @@ extern "C" {
 
         void * data;
 
-        char name[32];
+        char name[GGML_MAX_NAME];
 
         char padding[16];
     };
 
+    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
@@ -428,6 +432,7 @@ extern "C" {
     GGML_API float   ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
 
     GGML_API const char * ggml_type_name(enum ggml_type type);
+    GGML_API const char * ggml_op_name  (enum ggml_op   op);
 
     GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);
 
@@ -436,6 +441,9 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
+    // use this to compute the memory overhead of a tensor
+    GGML_API size_t ggml_tensor_overhead(void);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@@ -443,7 +451,11 @@ extern "C" {
 
     GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx);
 
-    GGML_API size_t  ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API size_t  ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API void    ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
+
+    GGML_API void *  ggml_get_mem_buffer(struct ggml_context * ctx);
+    GGML_API size_t  ggml_get_mem_size  (struct ggml_context * ctx);
 
     GGML_API struct ggml_tensor * ggml_new_tensor(
             struct ggml_context * ctx,
@@ -483,6 +495,8 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
 
+    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
     GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
@@ -969,6 +983,11 @@ extern "C" {
     GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
     GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);
 
+    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+
+    GGML_API void               ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
+    GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
+
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
    
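The ggml.h additions are mostly about memory planning and name-based lookup: ggml_tensor_overhead() reports the per-tensor bookkeeping cost, ggml_set_no_alloc(), ggml_get_mem_buffer() and ggml_get_mem_size() let callers manage the context buffer themselves, and ggml_get_tensor() / ggml_graph_get_tensor() retrieve tensors by the (now GGML_MAX_NAME-sized) name field. A minimal C++ sketch of how these fit together; the buffer size and tensor shape are illustrative only, and it assumes ggml_set_name(), which the header already provides:

    #include <cstdio>
    #include "ggml.h"

    int main() {
        // Reserve room for tensor data plus the per-tensor bookkeeping
        // that ggml_tensor_overhead() now exposes.
        const size_t n_tensors = 8;
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16 * 1024 * 1024 + n_tensors * ggml_tensor_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };

        struct ggml_context * ctx = ggml_init(params);

        // Create and name a tensor, then look it up again by name.
        struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
        ggml_set_name(w, "weights");

        struct ggml_tensor * found = ggml_get_tensor(ctx, "weights");
        printf("found %s, used %zu of %zu bytes\n",
               found->name, ggml_used_mem(ctx), ggml_get_mem_size(ctx));

        ggml_free(ctx);
        return 0;
    }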
data/ext/llama_cpp/src/llama.cpp
CHANGED

@@ -12,6 +12,8 @@
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
+#elif defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
 #endif
 
 #include <array>
@@ -40,6 +42,7 @@
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_3B,
     MODEL_7B,
     MODEL_13B,
     MODEL_30B,
@@ -56,6 +59,7 @@ static const size_t MB = 1024*1024;
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,    128ull * MB },
         { MODEL_7B,    512ull * MB },
         { MODEL_13B,   512ull * MB },
         { MODEL_30B,   512ull * MB },
@@ -67,6 +71,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,    128ull * MB },
         { MODEL_7B,    512ull * MB },
         { MODEL_13B,   512ull * MB },
         { MODEL_30B,   512ull * MB },
@@ -79,6 +84,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,    682ull * MB },
         { MODEL_7B,   1026ull * MB },
         { MODEL_13B,  1608ull * MB },
         { MODEL_30B,  3124ull * MB },
@@ -92,6 +98,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,   512ull * MB },
         { MODEL_7B,   768ull * MB },
         { MODEL_13B, 1024ull * MB },
         { MODEL_30B, 1280ull * MB },
@@ -897,6 +904,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
 
 static const char *llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_3B: return "3B";
         case MODEL_7B: return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
@@ -930,6 +938,7 @@ static void llama_model_load_internal(
 
     {
         switch (hparams.n_layer) {
+            case 26: model.type = e_model::MODEL_3B; break;
            case 32: model.type = e_model::MODEL_7B; break;
            case 40: model.type = e_model::MODEL_13B; break;
            case 60: model.type = e_model::MODEL_30B; break;
@@ -1092,7 +1101,7 @@ static void llama_model_load_internal(
            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
        }
        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-#else
+#elif !defined(GGML_USE_CLBLAST)
        (void) n_gpu_layers;
 #endif
    }
@@ -1125,7 +1134,33 @@ static void llama_model_load_internal(
            done_size += lt.size;
        }
    }
-#endif
+#elif defined(GGML_USE_CLBLAST)
+    {
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
+
+        size_t vram_total = 0;
+
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+
+            ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+            ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+            ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+            ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+            ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+            ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+            ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        }
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
+            ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        }
+
+        fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
+    }
+#endif
 
    if (progress_callback) {
        progress_callback(1.0f, progress_callback_user_data);
    
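Two things stand out in the llama.cpp changes. First, the new MODEL_3B entries (selected when hparams.n_layer == 26) reserve 128, 128, 682 and 512 MB in the scratch, KV-self and eval tables. Second, the CLBlast path mirrors the cuBLAS one: clamp n_gpu_layers to the model's layer count, move each layer's seven weight matrices to the device, and offload the output matrix only when more layers were requested than exist. A hedged, standalone sketch of that accounting, using illustrative parameters rather than llama.cpp's internal types (bytes_per_layer and output_bytes stand in for summing ggml_nbytes() over wq..w3 and model.output):

    #include <algorithm>
    #include <cstdio>

    // Sketch of the offload bookkeeping shown in the hunk above.
    size_t plan_vram(int n_gpu_layers, int n_layer, size_t bytes_per_layer, size_t output_bytes) {
        const int n_gpu = std::min(n_gpu_layers, n_layer);

        size_t vram_total = size_t(n_gpu) * bytes_per_layer;
        if (n_gpu_layers > n_layer) {
            vram_total += output_bytes; // output matrix is offloaded only when asked for more than n_layer layers
        }

        fprintf(stderr, "would offload %d layers, ~%zu MB of VRAM\n", n_gpu, vram_total / 1024 / 1024);
        return vram_total;
    }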
data/ext/llama_cpp/src/llama.h
CHANGED

@@ -31,6 +31,11 @@
 #define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION        1
 
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+#define LLAMA_SUPPORTS_GPU_OFFLOAD
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
    
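LLAMA_SUPPORTS_GPU_OFFLOAD gives callers a compile-time signal before they ask for layers on the GPU. A minimal C++ sketch, assuming the n_gpu_layers field that llama_context_params carries in llama.cpp builds of this vintage; the layer count of 26 is illustrative (it matches the new 3B model):

    #include <cstdio>
    #include "llama.h"

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s <model.bin>\n", argv[0]);
            return 1;
        }

        llama_context_params params = llama_context_default_params();

    #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
        // Built with cuBLAS or CLBlast: request some layers on the GPU.
        params.n_gpu_layers = 26;
    #else
        // CPU-only build: leave everything on the CPU; the field is ignored.
    #endif

        llama_context * ctx = llama_init_from_file(argv[1], params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        llama_free(ctx);
        return 0;
    }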
data/lib/llama_cpp/version.rb
CHANGED

@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.1.2'
+  VERSION = '0.1.4'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-ffb06a3'
 end
    
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-06-03 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -27,7 +27,7 @@ files:
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
 - ext/llama_cpp/src/ggml-cuda.h
-- ext/llama_cpp/src/ggml-opencl.c
+- ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h