llama_cpp 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/CHANGELOG.md +7 -0
 - data/examples/README.md +60 -0
 - data/examples/chat.rb +195 -0
 - data/ext/llama_cpp/llama_cpp.cpp +52 -0
 - data/ext/llama_cpp/src/ggml-cuda.cu +697 -130
 - data/ext/llama_cpp/src/ggml-cuda.h +4 -1
 - data/ext/llama_cpp/src/ggml-metal.h +1 -0
 - data/ext/llama_cpp/src/ggml-metal.m +548 -497
 - data/ext/llama_cpp/src/ggml-metal.metal +425 -122
 - data/ext/llama_cpp/src/ggml-opencl.cpp +3 -32
 - data/ext/llama_cpp/src/ggml-opencl.h +1 -2
 - data/ext/llama_cpp/src/ggml.c +1904 -303
 - data/ext/llama_cpp/src/ggml.h +126 -2
 - data/ext/llama_cpp/src/llama.cpp +212 -108
 - data/ext/llama_cpp/src/llama.h +12 -3
 - data/lib/llama_cpp/version.rb +2 -2
 - data/sig/llama_cpp.rbs +3 -0
 - metadata +4 -2
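
The headline change in the bundled llama.cpp sources (see the llama.cpp/llama.h hunks below) is a new `low_vram` field on `llama_context_params` plus finer-grained GPU offloading: repeating layers first, then the output/norm tensors, then the v and k caches. The following is a minimal sketch of how that flag is set through the llama.cpp C API of this period; `llama_init_from_file` and the `n_gpu_layers` field name are assumptions based on that API, not something this diff confirms — check llama.h in data/ext/llama_cpp/src for the exact signatures shipped with the gem.

// Sketch only: exercises the low_vram flag added in the bundled llama.cpp.
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        std::fprintf(stderr, "usage: %s <model.bin>\n", argv[0]);
        return 1;
    }

    llama_context_params params = llama_context_default_params();
    params.n_gpu_layers = 35;   // offload this many repeating layers (CUDA/CLBlast builds)
    params.low_vram     = true; // new in this version: skip the VRAM scratch buffer
                                // and keep the KV cache on the host

    llama_context * ctx = llama_init_from_file(argv[1], params); // API name assumed for this era
    if (ctx == nullptr) {
        std::fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // ... tokenize, llama_eval, sample ...

    llama_free(ctx);
    return 0;
}
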
 
    
data/ext/llama_cpp/src/llama.cpp CHANGED

@@ -165,6 +165,11 @@ struct llama_kv_cache {
         if (ctx) {
             ggml_free(ctx);
         }
+
+#ifdef GGML_USE_CUBLAS
+        ggml_cuda_free_data(k);
+        ggml_cuda_free_data(v);
+#endif // GGML_USE_CUBLAS
     }
 };

@@ -210,6 +215,7 @@ struct llama_model {
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cuda_free_data(tensors_by_name[i].second);
         }
+        ggml_cuda_free_scratch();
 #elif defined(GGML_USE_CLBLAST)
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cl_free_data(tensors_by_name[i].second);
@@ -707,6 +713,9 @@ struct llama_model_loader {

     struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, true);
+        }
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
         } else {
@@ -716,6 +725,9 @@ struct llama_model_loader {
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor

+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, use_mmap);
+        }
         tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
@@ -731,6 +743,7 @@ struct llama_model_loader {
     void load_all_data(llama_progress_callback progress_callback, void *  progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
         size_t prefetch_size = 0;
+        size_t lock_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
             if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -740,11 +753,6 @@ struct llama_model_loader {

         if (use_mmap) {
             mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
-            if (!lmlock) {
-                // Don't call the callback since the actual loading will be lazy
-                // and we can't measure it.
-                progress_callback = NULL;
-            }
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
@@ -752,20 +760,49 @@ struct llama_model_loader {

         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
-                continue;
-            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
             LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
             lt.data = (uint8_t *) lt.ggml_tensor->data;
+
+            // allocate temp buffer if not using mmap
+            if (!use_mmap && lt.data == NULL) {
+                GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU);
+                lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor));
+            }
+
             load_data_for(lt);
-
-
-
-
+
+            switch(lt.ggml_tensor->backend) {
+                case GGML_BACKEND_CPU:
+                    lt.ggml_tensor->data = lt.data;
+                    if (use_mmap && lmlock) {
+                        lock_size += lt.size;
+                        lmlock->grow_to(lock_size);
+                    }
+                    break;
+#if defined(GGML_USE_CUBLAS)
+                case GGML_BACKEND_GPU:
+                case GGML_BACKEND_GPU_SPLIT:
+                    ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#elif defined(GGML_USE_CLBLAST)
+                case GGML_BACKEND_GPU:
+                    ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#endif
+                default:
+                    continue;
             }
+
+            done_size += lt.size;
         }
     }

@@ -836,7 +873,8 @@ static bool kv_cache_init(
         const struct llama_hparams & hparams,
              struct llama_kv_cache & cache,
                          ggml_type   wtype,
-                               int   n_ctx
+                               int   n_ctx,
+                               int   n_gpu_layers) {
     const int n_embd  = hparams.n_embd;
     const int n_layer = hparams.n_layer;

@@ -862,6 +900,15 @@ static bool kv_cache_init(
     ggml_set_name(cache.k, "cache_k");
     ggml_set_name(cache.v, "cache_v");

+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer + 1) {
+        ggml_cuda_assign_buffers_no_scratch(cache.v);
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        ggml_cuda_assign_buffers_no_scratch(cache.k);
+    }
+#endif // GGML_USE_CUBLAS
+
     return true;
 }

@@ -872,6 +919,7 @@ struct llama_context_params llama_context_default_params() {
         /*.gpu_layers                  =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ {0},
+        /*.low_vram                    =*/ false,
         /*.seed                        =*/ -1,
         /*.f16_kv                      =*/ true,
         /*.logits_all                  =*/ false,
@@ -980,6 +1028,7 @@ static void llama_model_load_internal(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1005,6 +1054,12 @@ static void llama_model_load_internal(
             case 40: model.type = e_model::MODEL_13B; break;
             case 60: model.type = e_model::MODEL_30B; break;
             case 80: model.type = e_model::MODEL_65B; break;
+            default:
+                {
+                    if (hparams.n_layer < 32) {
+                        model.type = e_model::MODEL_7B;
+                    }
+                } break;
         }

         hparams.n_ctx = n_ctx;
@@ -1100,18 +1155,34 @@ static void llama_model_load_internal(
         ml->ggml_ctx = ctx;

         model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
-        model.norm           = ml->get_tensor("norm.weight",           {n_embd},          GGML_BACKEND_CPU);

         // "output" tensor
         {
+            ggml_backend backend_norm;
             ggml_backend backend_output;
             if (n_gpu_layers > int(n_layer)) { // NOLINT
+                // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+                backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
                 backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
             } else {
+                backend_norm = GGML_BACKEND_CPU;
                 backend_output = GGML_BACKEND_CPU;
             }

+            model.norm   = ml->get_tensor("norm.weight",   {n_embd},          backend_norm);
             model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+            if (backend_norm == GGML_BACKEND_GPU) {
+                vram_weights += ggml_nbytes(model.norm);
+            }
+            if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                vram_weights += ggml_nbytes(model.output);
+            }
         }

         const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1141,7 +1212,7 @@ static void llama_model_load_internal(
             if (backend == GGML_BACKEND_GPU) {
                 vram_weights +=
                     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk)             +
-                    ggml_nbytes(layer.wv)             + ggml_nbytes(layer.wo) + ggml_nbytes(layer.
+                    ggml_nbytes(layer.wv)             + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
                     ggml_nbytes(layer.w1)             + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
             }
         }
@@ -1169,23 +1240,49 @@ static void llama_model_load_internal(
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);

         (void) vram_scratch;
+        (void) n_batch;
 #ifdef GGML_USE_CUBLAS
-
-
-
-
-
+        if (low_vram) {
+            fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
+            ggml_cuda_set_scratch_size(0); // disable scratch
+        } else {
+            vram_scratch = n_batch * MB;
+            ggml_cuda_set_scratch_size(vram_scratch);
+            if (n_gpu_layers > 0) {
+                fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+                        __func__, vram_scratch / MB);
+            }
         }
 #endif // GGML_USE_CUBLAS
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

-        fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
+        fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
         if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s: offloading
+            fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
         }
+        size_t vram_kv_cache = 0;
+        if (n_gpu_layers > (int) hparams.n_layer + 1) {
+            if (low_vram) {
+                fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
+            } else {
+                fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
+                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+            }
+        }
+        if (n_gpu_layers > (int) hparams.n_layer + 2) {
+            if (low_vram) {
+                fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
+            } else {
+                fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
+                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+            }
+        }
+        const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+        fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
+                __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
         fprintf(stderr, "%s: total VRAM used: %zu MB\n",
-                __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
+                __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
         (void) n_gpu_layers;
 #endif
@@ -1196,58 +1293,15 @@ static void llama_model_load_internal(
         model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
     }

-
-
+    (void) tensor_split;
 #if defined(GGML_USE_CUBLAS)
     {
         ggml_cuda_set_tensor_split(tensor_split);
-
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            ggml_backend backend = lt.ggml_tensor->backend;
-            if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
     }
-#elif defined(GGML_USE_CLBLAST)
-    {
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
-    }
-#else
-    (void) n_batch;
-    (void) tensor_split;
 #endif

+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
     }
@@ -1267,6 +1321,7 @@ static bool llama_model_load(
         int n_gpu_layers,
         int main_gpu,
         float * tensor_split,
+        bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1274,7 +1329,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
+        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1350,12 +1405,33 @@ static bool llama_eval_internal(
     const int i_gpu_start = n_layer - n_gpu_layers;
     (void) i_gpu_start;

+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    //
+    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
+    // in that case ggml_cuda_assign_buffers has no effect
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v  = ggml_cuda_assign_buffers;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers;
+    }
+#endif // GGML_USE_CUBLAS
+
     for (int il = 0; il < n_layer; ++il) {
         offload_func_t offload_func = llama_nop;

 #ifdef GGML_USE_CUBLAS
         if (il >= i_gpu_start) {
-            offload_func = ggml_cuda_assign_buffers;
+            offload_func = ggml_cuda_assign_buffers;
         }
 #endif // GGML_USE_CUBLAS

@@ -1378,31 +1454,42 @@ static bool llama_eval_internal(
         // self-attention
         {
             // compute Q and K and RoPE them
-            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-            // offload_func(tmpq);
-            ggml_set_name(tmpq, "tmpq");
-
             struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-
+            offload_func_kq(tmpk);
             ggml_set_name(tmpk, "tmpk");

+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            offload_func_kq(tmpq);
+            ggml_set_name(tmpq, "tmpq");
+
             struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");

             struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");

             // store key and value to memory
             {
                 // compute the transposed [N, n_embd] V matrix
-
+
+                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                offload_func_v(tmpv);
+                ggml_set_name(tmpv, "tmpv");
+
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
+                offload_func_v(Vcur);
                 ggml_set_name(Vcur, "Vcur");

                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                offload_func_kq(k);
                 ggml_set_name(k, "k");
+
                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                         (   n_ctx)*ggml_element_size(kv_self.v),
                         (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                offload_func_v(v);
                 ggml_set_name(v, "v");

                 // important: storing RoPE-ed version of K in the KV cache!
@@ -1414,6 +1501,7 @@ static bool llama_eval_internal(
                     ggml_permute(ctx0,
                             Qcur,
                             0, 2, 1, 3);
+            offload_func_kq(Q);
             ggml_set_name(Q, "Q");

             struct ggml_tensor * K =
@@ -1422,10 +1510,12 @@ static bool llama_eval_internal(
                             ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
                             n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);
+            offload_func_kq(K);
             ggml_set_name(K, "K");

             // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
             ggml_set_name(KQ, "KQ");

             // KQ_scaled = KQ / sqrt(n_embd/n_head)
@@ -1434,14 +1524,17 @@ static bool llama_eval_internal(

             // KQ_scaled shape [n_past + N, N, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");

             // KQ_masked = mask_past(KQ_scaled)
             struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            offload_func_kq(KQ_masked);
             ggml_set_name(KQ_masked, "KQ_masked");

             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");

             // split cached V into n_head heads
@@ -1451,10 +1544,12 @@ static bool llama_eval_internal(
                         n_ctx*ggml_element_size(kv_self.v),
                         n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
                         il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+            offload_func_v(V);
             ggml_set_name(V, "V");

 #if 1
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
             ggml_set_name(KQV, "KQV");
 #else
             // make V contiguous in memory to speed up the matmul, however we waste time on the copy
@@ -1466,12 +1561,14 @@ static bool llama_eval_internal(

             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
             ggml_set_name(KQV_merged, "KQV_merged");

             // cur = KQV_merged.contiguous().view(n_embd, N)
             cur = ggml_cpy(ctx0,
         
     | 
| 
       1473 
1569 
     | 
    
         
             
                                KQV_merged,
         
     | 
| 
       1474 
1570 
     | 
    
         
             
                                ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
         
     | 
| 
      
 1571 
     | 
    
         
            +
                        offload_func_v(cur);
         
     | 
| 
       1475 
1572 
     | 
    
         
             
                        ggml_set_name(cur, "KQV_merged_contiguous");
         
     | 
| 
       1476 
1573 
     | 
    
         | 
| 
       1477 
1574 
     | 
    
         
             
                        // projection (no bias)
         
     | 
| 
         @@ -1483,7 +1580,6 @@ static bool llama_eval_internal( 
     | 
|
| 
       1483 
1580 
     | 
    
         
             
                    }
         
     | 
| 
       1484 
1581 
     | 
    
         | 
| 
       1485 
1582 
     | 
    
         
             
                    lctx.use_buf(ctx0, 1);
         
     | 
| 
       1486 
     | 
    
         
            -
                    //ggml_cuda_set_scratch(1);
         
     | 
| 
       1487 
1583 
     | 
    
         | 
| 
       1488 
1584 
     | 
    
         
             
                    struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
         
     | 
| 
       1489 
1585 
     | 
    
         
             
                    offload_func(inpFF);
         
     | 
| 
         @@ -1541,32 +1637,24 @@ static bool llama_eval_internal( 
     | 
|
| 
       1541 
1637 
     | 
    
         
             
                }
         
     | 
| 
       1542 
1638 
     | 
    
         | 
| 
       1543 
1639 
     | 
    
         
             
                lctx.use_buf(ctx0, 0);
         
     | 
| 
       1544 
     | 
    
         
            -
                //ggml_cuda_set_scratch(0);
         
     | 
| 
       1545 
1640 
     | 
    
         | 
| 
       1546 
1641 
     | 
    
         
             
                // used at the end to optionally extract the embeddings
         
     | 
| 
       1547 
1642 
     | 
    
         
             
                struct ggml_tensor * embeddings = NULL;
         
     | 
| 
       1548 
1643 
     | 
    
         | 
| 
       1549 
     | 
    
         
            -
                offload_func_t offload_func = llama_nop;
         
     | 
| 
       1550 
     | 
    
         
            -
             
     | 
| 
       1551 
     | 
    
         
            -
            #ifdef GGML_USE_CUBLAS
         
     | 
| 
       1552 
     | 
    
         
            -
                    if (n_gpu_layers > n_layer) {
         
     | 
| 
       1553 
     | 
    
         
            -
                        offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
         
     | 
| 
       1554 
     | 
    
         
            -
                    }
         
     | 
| 
       1555 
     | 
    
         
            -
            #endif // GGML_USE_CUBLAS
         
     | 
| 
       1556 
1644 
     | 
    
         | 
| 
       1557 
1645 
     | 
    
         
             
                // norm
         
     | 
| 
       1558 
1646 
     | 
    
         
             
                {
         
     | 
| 
       1559 
1647 
     | 
    
         
             
                    cur = ggml_rms_norm(ctx0, inpL);
         
     | 
| 
       1560 
     | 
    
         
            -
                     
     | 
| 
      
 1648 
     | 
    
         
            +
                    offload_func_nr(cur);
         
     | 
| 
       1561 
1649 
     | 
    
         
             
                    ggml_set_name(cur, "rms_norm_inpL");
         
     | 
| 
       1562 
1650 
     | 
    
         | 
| 
       1563 
1651 
     | 
    
         
             
                    cur = ggml_rms_norm(ctx0, cur);
         
     | 
| 
       1564 
     | 
    
         
            -
                     
     | 
| 
      
 1652 
     | 
    
         
            +
                    offload_func_nr(cur);
         
     | 
| 
       1565 
1653 
     | 
    
         
             
                    ggml_set_name(cur, "rms_norm_after");
         
     | 
| 
       1566 
1654 
     | 
    
         | 
| 
       1567 
1655 
     | 
    
         
             
                    // cur = cur*norm(broadcasted)
         
     | 
| 
       1568 
1656 
     | 
    
         
             
                    cur = ggml_mul(ctx0, cur, model.norm);
         
     | 
| 
       1569 
     | 
    
         
            -
                     
     | 
| 
      
 1657 
     | 
    
         
            +
                    offload_func_nr(cur);
         
     | 
| 
       1570 
1658 
     | 
    
         
             
                    ggml_set_name(cur, "result_norm");
         
     | 
| 
       1571 
1659 
     | 
    
         | 
| 
       1572 
1660 
     | 
    
         
             
                    embeddings = cur;
         
     | 
| 
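
The hunks above thread nearly every intermediate attention tensor (Q, K, KQ, KQ_scaled, KQ_masked, KQ_soft_max, V, KQV, KQV_merged and the merged copy) through the offload_func_kq / offload_func_v selectors, route the final RMS-norm results through offload_func_nr, and drop the single offload_func setup that used to sit just before the norm block. The definitions of these selectors are not part of this excerpt; the sketch below is only a minimal, self-contained illustration of the function-pointer pattern they appear to follow, and the stub types, main() and the exact thresholds are assumptions.

    // Minimal sketch of a per-tensor offload selector, as suggested by the calls above.
    // ggml types are stubbed out; offload_func_kq / offload_func_v are names taken from
    // the diff, while the thresholds and stubs are assumptions for illustration only.
    #include <cstdio>

    struct tensor_stub { const char * name; };

    typedef void (*offload_func_t)(tensor_stub * t);

    static void nop_offload(tensor_stub *) {}                   // CPU path: do nothing
    static void cuda_offload(tensor_stub * t) {                 // GPU path: mark for offload
        std::printf("assigning %s to the GPU backend\n", t->name);
    }

    int main() {
        const int n_layer      = 32;
        const int n_gpu_layers = 35;  // assumed: values beyond n_layer enable extra offloading

        offload_func_t offload_func_kq = nop_offload;
        offload_func_t offload_func_v  = nop_offload;
        if (n_gpu_layers > n_layer + 1) offload_func_v  = cuda_offload;
        if (n_gpu_layers > n_layer + 2) offload_func_kq = cuda_offload;

        tensor_stub KQ = { "KQ" };
        offload_func_kq(&KQ);         // a no-op unless enough layers are offloaded
        return 0;
    }
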
@@ -2174,6 +2262,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
         return -log2f(candidate.p) > *mu;
     }));
 
+    if (candidates->size == 0) {
+        candidates->size = 1;
+    }
+
     // Normalize the probabilities of the remaining words
     llama_sample_softmax(ctx, candidates);
 
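
The added guard protects the Mirostat v2 sampler from ending up with an empty candidate list: when even the most probable token has a surprise value above *mu, the truncation right above it shrinks candidates->size to zero and the following llama_sample_softmax would have nothing to normalize. A small, self-contained sketch of the same clamp, using invented probabilities and an invented mu value:

    // Standalone illustration of the clamp added above: truncate by surprise, but never
    // drop below one candidate. Probabilities and mu are invented for the example.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<float> p = {0.05f, 0.03f, 0.02f};   // sorted candidate probabilities
        const float mu = 2.0f;                          // target surprise value

        auto cut = std::find_if(p.begin(), p.end(), [&](float prob) {
            return -std::log2(prob) > mu;               // same predicate as in the diff
        });
        size_t size = (size_t) std::distance(p.begin(), cut);
        if (size == 0) {
            size = 1;                                   // the clamp introduced here
        }
        std::printf("keeping %zu candidate(s) out of %zu\n", size, p.size());
        return 0;
    }
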
@@ -2311,7 +2403,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
+        case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
+        case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
 
+#ifdef GGML_USE_K_QUANTS
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K:   quantized_type = GGML_TYPE_Q2_K; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
@@ -2322,6 +2417,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
         case LLAMA_FTYPE_MOSTLY_Q6_K:   quantized_type = GGML_TYPE_Q6_K; break;
+#endif
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
 
@@ -2333,6 +2429,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
 
+#ifdef GGML_USE_K_QUANTS
     int n_attention_wv    = 0;
     int n_feed_forward_w2 = 0;
     for (auto& tensor : model_loader->tensors_map.tensors) {
@@ -2346,6 +2443,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     int i_attention_wv = 0;
     int i_feed_forward_w2 = 0;
+#endif
 
     size_t total_size_org = 0;
     size_t total_size_new = 0;
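
Two things change in the quantization setup above: LLAMA_FTYPE_MOSTLY_F16 and LLAMA_FTYPE_ALL_F32 are now accepted as output types (effectively a format conversion rather than a quantization), and the k-quant cases plus the wv / w2 counters are compiled only when GGML_USE_K_QUANTS is defined, so builds without k-quant support reject those file types through the default branch. A reduced, self-contained sketch of that dispatch; the enum names here are stand-ins, not the real llama.h constants:

    // Reduced sketch of the ftype -> tensor-type dispatch after this change. The enums
    // are stand-ins; only the shape of the switch mirrors the diff above.
    #include <cstdio>
    #include <stdexcept>

    enum ftype_stub { FTYPE_ALL_F32, FTYPE_MOSTLY_F16, FTYPE_MOSTLY_Q8_0, FTYPE_MOSTLY_Q6_K };
    enum type_stub  { TYPE_F32, TYPE_F16, TYPE_Q8_0, TYPE_Q6_K };

    static type_stub pick_quantized_type(ftype_stub ftype) {
        switch (ftype) {
            case FTYPE_MOSTLY_Q8_0: return TYPE_Q8_0;
            case FTYPE_MOSTLY_F16:  return TYPE_F16;   // newly accepted: plain conversion
            case FTYPE_ALL_F32:     return TYPE_F32;   // newly accepted: plain conversion
    #ifdef GGML_USE_K_QUANTS
            case FTYPE_MOSTLY_Q6_K: return TYPE_Q6_K;  // only with k-quants compiled in
    #endif
            default: throw std::runtime_error("invalid output file type");
        }
    }

    int main() {
        std::printf("F16 maps to type %d\n", (int) pick_quantized_type(FTYPE_MOSTLY_F16));
        return 0;
    }
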
@@ -2371,12 +2469,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // quantize only 2D tensors
         quantize &= (tensor.ne.size() == 2);
-
-
-        if (!params->quantize_output_tensor && tensor.name == "output.weight") {
-           quantize = false;
-        }
-        quantize = quantize && quantized_type != tensor.type;
+        quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
+        quantize &= quantized_type != tensor.type;
 
         enum ggml_type new_type;
         void * new_data;
@@ -2390,31 +2484,28 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
         } else {
             new_type = quantized_type;
-
-
-
-
-            //}
-            if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+#ifdef GGML_USE_K_QUANTS
+            if (tensor.name == "output.weight") {
+               new_type = GGML_TYPE_Q6_K;
+            } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
                 if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                          (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
                          (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_attention_wv;
-            }
-            if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
                 if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                          (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
                          (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_feed_forward_w2;
-            }
-            if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+            } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
                 if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+#endif
 
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
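
The rewritten quantize condition is a pure simplification of the removed if-block: output.weight is skipped unless quantize_output_tensor is set, and the requantize check becomes another &= step. The k-quant-only branch that follows now forces output.weight to Q6_K and chains the wv / w2 / wo heuristics with else-if instead of three independent ifs. A tiny self-contained check of the boolean equivalence; the helper names are invented for this example:

    // Quick equivalence check between the old if-block and the new boolean form used
    // above. Helper names are invented for this example only.
    #include <cassert>
    #include <string>

    static bool new_form(bool quantize, bool quantize_output_tensor, const std::string & name) {
        quantize &= quantize_output_tensor || name != "output.weight";
        return quantize;
    }

    static bool old_form(bool quantize, bool quantize_output_tensor, const std::string & name) {
        if (!quantize_output_tensor && name == "output.weight") {
            quantize = false;
        }
        return quantize;
    }

    int main() {
        const char * names[] = { "output.weight", "layers.0.attention.wv.weight" };
        for (int q = 0; q < 2; ++q)
        for (int qo = 0; qo < 2; ++qo)
        for (const char * name : names) {
            assert(new_form(q != 0, qo != 0, name) == old_form(q != 0, qo != 0, name));
        }
        return 0;
    }
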
@@ -2554,8 +2645,8 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                params.
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
+                params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
                 params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
         llama_free(ctx);
@@ -2564,7 +2655,7 @@ struct llama_context * llama_init_from_file(
 
     // reserve memory for context buffers
     if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
+        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
     | 
|
| 
       3301 
3392 
     | 
    
         
             
                return ctx->model.hparams.n_embd;
         
     | 
| 
       3302 
3393 
     | 
    
         
             
            }
         
     | 
| 
       3303 
3394 
     | 
    
         | 
| 
      
 3395 
     | 
    
         
            +
            int llama_get_vocab(
         
     | 
| 
      
 3396 
     | 
    
         
            +
                    const struct llama_context * ctx,
         
     | 
| 
      
 3397 
     | 
    
         
            +
                    const char * * strings,
         
     | 
| 
      
 3398 
     | 
    
         
            +
                    float  * scores,
         
     | 
| 
      
 3399 
     | 
    
         
            +
                    int capacity) {
         
     | 
| 
      
 3400 
     | 
    
         
            +
                int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
         
     | 
| 
      
 3401 
     | 
    
         
            +
                for (int i = 0; i<n; ++i) {
         
     | 
| 
      
 3402 
     | 
    
         
            +
                    strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
         
     | 
| 
      
 3403 
     | 
    
         
            +
                    scores[i]  = ctx->vocab.id_to_token[i].score;
         
     | 
| 
      
 3404 
     | 
    
         
            +
                }
         
     | 
| 
      
 3405 
     | 
    
         
            +
                return n;
         
     | 
| 
      
 3406 
     | 
    
         
            +
            }
         
     | 
| 
      
 3407 
     | 
    
         
            +
             
     | 
| 
       3304 
3408 
     | 
    
         
             
            float * llama_get_logits(struct llama_context * ctx) {
         
     | 
| 
       3305 
3409 
     | 
    
         
             
                return ctx->logits.data();
         
     | 
| 
       3306 
3410 
     | 
    
         
             
            }
         
     |
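
The new llama_get_vocab accessor copies up to capacity token strings and their scores into caller-provided arrays and returns the number of entries written. A hedged usage sketch in C++, assuming the llama.h bundled with this release; the model path is a placeholder and error handling is kept minimal:

    // Hedged usage sketch for the llama_get_vocab() accessor added above: dump the first
    // few vocabulary entries of a loaded model. The model path is a placeholder.
    #include <cstdio>
    #include <vector>
    #include "llama.h"

    int main() {
        llama_context * ctx = llama_init_from_file("model.ggml.bin", llama_context_default_params());
        if (ctx == NULL) {
            return 1;
        }

        const int n_vocab = llama_n_vocab(ctx);
        std::vector<const char *> strings((size_t) n_vocab);
        std::vector<float>        scores((size_t) n_vocab);

        const int n = llama_get_vocab(ctx, strings.data(), scores.data(), n_vocab);
        for (int i = 0; i < n && i < 5; ++i) {
            std::printf("%d: '%s' (score %.3f)\n", i, strings[(size_t) i], scores[(size_t) i]);
        }

        llama_free(ctx);
        return 0;
    }
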