llama_cpp 0.10.1 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/CHANGELOG.md +6 -0
 - data/ext/llama_cpp/llama_cpp.cpp +16 -1
 - data/ext/llama_cpp/src/ggml-alloc.c +12 -4
 - data/ext/llama_cpp/src/ggml-backend-impl.h +12 -8
 - data/ext/llama_cpp/src/ggml-backend.c +75 -5
 - data/ext/llama_cpp/src/ggml-backend.h +7 -0
 - data/ext/llama_cpp/src/ggml-cuda.cu +284 -162
 - data/ext/llama_cpp/src/ggml-metal.h +3 -0
 - data/ext/llama_cpp/src/ggml-metal.m +190 -44
 - data/ext/llama_cpp/src/ggml-metal.metal +11 -2
 - data/ext/llama_cpp/src/ggml.c +262 -89
 - data/ext/llama_cpp/src/ggml.h +24 -10
 - data/ext/llama_cpp/src/llama.cpp +926 -780
 - data/ext/llama_cpp/src/llama.h +8 -3
 - data/lib/llama_cpp/version.rb +2 -2
 - metadata +2 -2
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA256:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: b4e94e20f142572fb46cff141e109025e1c5b91b9cd6cabfbaeac163a920bb82
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 10fcd2922c057a0c960cbde83a0fa22a56eae6e21a73e20931cfe082108dec26
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 570f5f257d947bad8489fcafe6c7cc0dc342d593132111d3479c0202e0998e444ca75c2140883dc8cf6a22f0f4f74fecd264a577cee6aabde4c168f5003b4c98
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: d105e41aac3bb39b2ee53c43eb320272ffa5290caa7e14e2f08bebbade94417aed065e01d23b8def9cce08fd6789576d33ba4082750647c0bbf9ff4c776db9ac
         
     | 
    
        data/CHANGELOG.md
    CHANGED
    
    | 
         @@ -1,3 +1,9 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            ## [[0.10.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.10.1...v0.10.2)] - 2023-12-23
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            - Bump bundled llama.cpp from b1641 to b1686.
         
     | 
| 
      
 4 
     | 
    
         
            +
              - Add `LLAMA_FILE_MAGIC_GGLA` constant.
         
     | 
| 
      
 5 
     | 
    
         
            +
              - Add `n_batch` method to `Context`.
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
       1 
7 
     | 
    
         
             
            ## [[0.10.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.10.0...v0.10.1)] - 2023-12-16
         
     | 
| 
       2 
8 
     | 
    
         | 
| 
       3 
9 
     | 
    
         
             
            - Bump bundled llama.cpp from b1620 to b1641.
         
     | 
    
        data/ext/llama_cpp/llama_cpp.cpp
    CHANGED
    
    | 
         @@ -1949,6 +1949,7 @@ public: 
     | 
|
| 
       1949 
1949 
     | 
    
         
             
                rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
         
     | 
| 
       1950 
1950 
     | 
    
         
             
                rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
         
     | 
| 
       1951 
1951 
     | 
    
         
             
                rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
         
     | 
| 
      
 1952 
     | 
    
         
            +
                rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
         
     | 
| 
       1952 
1953 
     | 
    
         
             
                rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
         
     | 
| 
       1953 
1954 
     | 
    
         
             
                rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
         
     | 
| 
       1954 
1955 
     | 
    
         
             
                rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
         
     | 
| 
         @@ -2201,7 +2202,16 @@ private: 
     | 
|
| 
       2201 
2202 
     | 
    
         
             
                  rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
         
     | 
| 
       2202 
2203 
     | 
    
         
             
                  return Qnil;
         
     | 
| 
       2203 
2204 
     | 
    
         
             
                }
         
     | 
| 
       2204 
     | 
    
         
            -
                return  
     | 
| 
      
 2205 
     | 
    
         
            +
                return UINT2NUM(llama_n_ctx(ptr->ctx));
         
     | 
| 
      
 2206 
     | 
    
         
            +
              }
         
     | 
| 
      
 2207 
     | 
    
         
            +
             
     | 
| 
      
 2208 
     | 
    
         
            +
              static VALUE _llama_context_n_batch(VALUE self) {
         
     | 
| 
      
 2209 
     | 
    
         
            +
                LLaMAContextWrapper* ptr = get_llama_context(self);
         
     | 
| 
      
 2210 
     | 
    
         
            +
                if (ptr->ctx == NULL) {
         
     | 
| 
      
 2211 
     | 
    
         
            +
                  rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
         
     | 
| 
      
 2212 
     | 
    
         
            +
                  return Qnil;
         
     | 
| 
      
 2213 
     | 
    
         
            +
                }
         
     | 
| 
      
 2214 
     | 
    
         
            +
                return UINT2NUM(llama_n_batch(ptr->ctx));
         
     | 
| 
       2205 
2215 
     | 
    
         
             
              }
         
     | 
| 
       2206 
2216 
     | 
    
         | 
| 
       2207 
2217 
     | 
    
         
             
              static VALUE _llama_context_get_timings(VALUE self) {
         
     | 
| 
         @@ -3146,6 +3156,11 @@ extern "C" void Init_llama_cpp(void) { 
     | 
|
| 
       3146 
3156 
     | 
    
         
             
              rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));
         
     | 
| 
       3147 
3157 
     | 
    
         | 
| 
       3148 
3158 
     | 
    
         
             
              std::stringstream ss_magic;
         
     | 
| 
      
 3159 
     | 
    
         
            +
              ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
         
     | 
| 
      
 3160 
     | 
    
         
            +
              rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGLA", rb_str_new2(ss_magic.str().c_str()));
         
     | 
| 
      
 3161 
     | 
    
         
            +
             
     | 
| 
      
 3162 
     | 
    
         
            +
              ss_magic.str("");
         
     | 
| 
      
 3163 
     | 
    
         
            +
              ss_magic.clear(std::stringstream::goodbit);
         
     | 
| 
       3149 
3164 
     | 
    
         
             
              ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
         
     | 
| 
       3150 
3165 
     | 
    
         
             
              rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));
         
     | 
| 
       3151 
3166 
     | 
    
         | 
| 
         @@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd 
     | 
|
| 
       449 
449 
     | 
    
         
             
                if (update_backend) {
         
     | 
| 
       450 
450 
     | 
    
         
             
                    view->backend = view->view_src->backend;
         
     | 
| 
       451 
451 
     | 
    
         
             
                }
         
     | 
| 
       452 
     | 
    
         
            -
                 
     | 
| 
      
 452 
     | 
    
         
            +
                // views are initialized in the alloc buffer rather than the view_src buffer
         
     | 
| 
      
 453 
     | 
    
         
            +
                view->buffer  = alloc->buffer;
         
     | 
| 
       453 
454 
     | 
    
         
             
                view->data    = (char *)view->view_src->data + view->view_offs;
         
     | 
| 
       454 
455 
     | 
    
         | 
| 
       455 
     | 
    
         
            -
                // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
         
     | 
| 
       456 
     | 
    
         
            -
                // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
         
     | 
| 
       457 
456 
     | 
    
         
             
                assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
         
     | 
| 
       458 
457 
     | 
    
         | 
| 
       459 
458 
     | 
    
         
             
                if (!alloc->measure) {
         
     | 
| 
         @@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) { 
     | 
|
| 
       736 
735 
     | 
    
         
             
            }
         
     | 
| 
       737 
736 
     | 
    
         | 
| 
       738 
737 
     | 
    
         
             
            void ggml_allocr_free(ggml_allocr_t alloc) {
         
     | 
| 
      
 738 
     | 
    
         
            +
                if (alloc == NULL) {
         
     | 
| 
      
 739 
     | 
    
         
            +
                    return;
         
     | 
| 
      
 740 
     | 
    
         
            +
                }
         
     | 
| 
      
 741 
     | 
    
         
            +
             
     | 
| 
       739 
742 
     | 
    
         
             
                ggml_gallocr_free(alloc->galloc);
         
     | 
| 
       740 
743 
     | 
    
         
             
                ggml_tallocr_free(alloc->talloc);
         
     | 
| 
       741 
744 
     | 
    
         
             
                free(alloc);
         
     | 
| 
         @@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte 
     | 
|
| 
       775 
778 
     | 
    
         
             
                }
         
     | 
| 
       776 
779 
     | 
    
         | 
| 
       777 
780 
     | 
    
         
             
                if (nbytes == 0) {
         
     | 
| 
       778 
     | 
    
         
            -
                     
     | 
| 
      
 781 
     | 
    
         
            +
                    // all the tensors in the context are already allocated
         
     | 
| 
       779 
782 
     | 
    
         
             
                    return NULL;
         
     | 
| 
       780 
783 
     | 
    
         
             
                }
         
     | 
| 
       781 
784 
     | 
    
         | 
| 
         @@ -789,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte 
     | 
|
| 
       789 
792 
     | 
    
         
             
                        } else {
         
     | 
| 
       790 
793 
     | 
    
         
             
                            ggml_backend_view_init(buffer, t);
         
     | 
| 
       791 
794 
     | 
    
         
             
                        }
         
     | 
| 
      
 795 
     | 
    
         
            +
                    } else {
         
     | 
| 
      
 796 
     | 
    
         
            +
                        if (t->view_src != NULL) {
         
     | 
| 
      
 797 
     | 
    
         
            +
                            // view of a pre-allocated tensor
         
     | 
| 
      
 798 
     | 
    
         
            +
                            ggml_backend_view_init(buffer, t);
         
     | 
| 
      
 799 
     | 
    
         
            +
                        }
         
     | 
| 
       792 
800 
     | 
    
         
             
                    }
         
     | 
| 
       793 
801 
     | 
    
         
             
                }
         
     | 
| 
       794 
802 
     | 
    
         | 
| 
         @@ -20,6 +20,9 @@ extern "C" { 
     | 
|
| 
       20 
20 
     | 
    
         
             
                    size_t                (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
         
     | 
| 
       21 
21 
     | 
    
         
             
                    size_t                (*get_alloc_size)  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
         
     | 
| 
       22 
22 
     | 
    
         
             
                    bool                  (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
         
     | 
| 
      
 23 
     | 
    
         
            +
                    // check if tensor data is in host memory
         
     | 
| 
      
 24 
     | 
    
         
            +
                    // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
         
     | 
| 
      
 25 
     | 
    
         
            +
                    bool                  (*is_host)         (ggml_backend_buffer_type_t buft);
         
     | 
| 
       23 
26 
     | 
    
         
             
                };
         
     | 
| 
       24 
27 
     | 
    
         | 
| 
       25 
28 
     | 
    
         
             
                struct ggml_backend_buffer_type {
         
     | 
| 
         @@ -31,15 +34,16 @@ extern "C" { 
     | 
|
| 
       31 
34 
     | 
    
         
             
                typedef void * ggml_backend_buffer_context_t;
         
     | 
| 
       32 
35 
     | 
    
         | 
| 
       33 
36 
     | 
    
         
             
                struct ggml_backend_buffer_i {
         
     | 
| 
       34 
     | 
    
         
            -
                    void 
     | 
| 
      
 37 
     | 
    
         
            +
                    void   (*free_buffer)    (ggml_backend_buffer_t buffer);
         
     | 
| 
       35 
38 
     | 
    
         
             
                    //void     (*reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
         
     | 
| 
       36 
     | 
    
         
            -
                    void * 
     | 
| 
       37 
     | 
    
         
            -
                    void 
     | 
| 
       38 
     | 
    
         
            -
                    void 
     | 
| 
       39 
     | 
    
         
            -
                    void 
     | 
| 
      
 39 
     | 
    
         
            +
                    void * (*get_base)       (ggml_backend_buffer_t buffer);
         
     | 
| 
      
 40 
     | 
    
         
            +
                    void   (*init_tensor)    (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
         
     | 
| 
      
 41 
     | 
    
         
            +
                    void   (*set_tensor)     (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
         
     | 
| 
      
 42 
     | 
    
         
            +
                    void   (*get_tensor)     (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
         
     | 
| 
       40 
43 
     | 
    
         
             
                    // (optional) copy tensor between different buffer-type, allow for single-copy transfers
         
     | 
| 
       41 
     | 
    
         
            -
                    void 
     | 
| 
       42 
     | 
    
         
            -
                    void 
     | 
| 
      
 44 
     | 
    
         
            +
                    void   (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
         
     | 
| 
      
 45 
     | 
    
         
            +
                    void   (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
         
     | 
| 
      
 46 
     | 
    
         
            +
                    void   (*clear)          (ggml_backend_buffer_t buffer, uint8_t value);
         
     | 
| 
       43 
47 
     | 
    
         
             
                };
         
     | 
| 
       44 
48 
     | 
    
         | 
| 
       45 
49 
     | 
    
         
             
                struct ggml_backend_buffer {
         
     | 
| 
         @@ -78,7 +82,7 @@ extern "C" { 
     | 
|
| 
       78 
82 
     | 
    
         
             
                    void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
         
     | 
| 
       79 
83 
     | 
    
         
             
                    void (*cpy_tensor_to_async)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
         
     | 
| 
       80 
84 
     | 
    
         | 
| 
       81 
     | 
    
         
            -
                    void (*synchronize) 
     | 
| 
      
 85 
     | 
    
         
            +
                    void (*synchronize)(ggml_backend_t backend);
         
     | 
| 
       82 
86 
     | 
    
         | 
| 
       83 
87 
     | 
    
         
             
                    // compute graph with a plan
         
     | 
| 
       84 
88 
     | 
    
         
             
                    ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
         
     | 
| 
         @@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_ba 
     | 
|
| 
       35 
35 
     | 
    
         
             
                return buft->iface.supports_backend(buft, backend);
         
     | 
| 
       36 
36 
     | 
    
         
             
            }
         
     | 
| 
       37 
37 
     | 
    
         | 
| 
      
 38 
     | 
    
         
            +
            bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
         
     | 
| 
      
 39 
     | 
    
         
            +
                if (buft->iface.is_host) {
         
     | 
| 
      
 40 
     | 
    
         
            +
                    return buft->iface.is_host(buft);
         
     | 
| 
      
 41 
     | 
    
         
            +
                }
         
     | 
| 
      
 42 
     | 
    
         
            +
                return false;
         
     | 
| 
      
 43 
     | 
    
         
            +
            }
         
     | 
| 
      
 44 
     | 
    
         
            +
             
     | 
| 
       38 
45 
     | 
    
         
             
            // backend buffer
         
     | 
| 
       39 
46 
     | 
    
         | 
| 
       40 
47 
     | 
    
         
             
            ggml_backend_buffer_t ggml_backend_buffer_init(
         
     | 
| 
         @@ -94,6 +101,14 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g 
     | 
|
| 
       94 
101 
     | 
    
         
             
                return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
         
     | 
| 
       95 
102 
     | 
    
         
             
            }
         
     | 
| 
       96 
103 
     | 
    
         | 
| 
      
 104 
     | 
    
         
            +
            void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
         
     | 
| 
      
 105 
     | 
    
         
            +
                buffer->iface.clear(buffer, value);
         
     | 
| 
      
 106 
     | 
    
         
            +
            }
         
     | 
| 
      
 107 
     | 
    
         
            +
             
     | 
| 
      
 108 
     | 
    
         
            +
            bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
         
     | 
| 
      
 109 
     | 
    
         
            +
                return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
         
     | 
| 
      
 110 
     | 
    
         
            +
            }
         
     | 
| 
      
 111 
     | 
    
         
            +
             
     | 
| 
       97 
112 
     | 
    
         
             
            ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
         
     | 
| 
       98 
113 
     | 
    
         
             
                return buffer->buft;
         
     | 
| 
       99 
114 
     | 
    
         
             
            }
         
     | 
| 
         @@ -378,7 +393,6 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { 
     | 
|
| 
       378 
393 
     | 
    
         | 
| 
       379 
394 
     | 
    
         
             
            static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
         
     | 
| 
       380 
395 
     | 
    
         
             
                free(buffer->context);
         
     | 
| 
       381 
     | 
    
         
            -
                GGML_UNUSED(buffer);
         
     | 
| 
       382 
396 
     | 
    
         
             
            }
         
     | 
| 
       383 
397 
     | 
    
         | 
| 
       384 
398 
     | 
    
         
             
            static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
         
     | 
| 
         @@ -411,6 +425,10 @@ static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, 
     | 
|
| 
       411 
425 
     | 
    
         
             
                GGML_UNUSED(buffer);
         
     | 
| 
       412 
426 
     | 
    
         
             
            }
         
     | 
| 
       413 
427 
     | 
    
         | 
| 
      
 428 
     | 
    
         
            +
            static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
         
     | 
| 
      
 429 
     | 
    
         
            +
                memset(buffer->context, value, buffer->size);
         
     | 
| 
      
 430 
     | 
    
         
            +
            }
         
     | 
| 
      
 431 
     | 
    
         
            +
             
     | 
| 
       414 
432 
     | 
    
         
             
            static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
         
     | 
| 
       415 
433 
     | 
    
         
             
                /* .free_buffer     = */ ggml_backend_cpu_buffer_free_buffer,
         
     | 
| 
       416 
434 
     | 
    
         
             
                /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
         
     | 
| 
         @@ -419,6 +437,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = { 
     | 
|
| 
       419 
437 
     | 
    
         
             
                /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
         
     | 
| 
       420 
438 
     | 
    
         
             
                /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
         
     | 
| 
       421 
439 
     | 
    
         
             
                /* .cpy_tensor_to   = */ ggml_backend_cpu_buffer_cpy_tensor_to,
         
     | 
| 
      
 440 
     | 
    
         
            +
                /* .clear           = */ ggml_backend_cpu_buffer_clear,
         
     | 
| 
       422 
441 
     | 
    
         
             
            };
         
     | 
| 
       423 
442 
     | 
    
         | 
| 
       424 
443 
     | 
    
         
             
            // for buffers from ptr, free is not called
         
     | 
| 
         @@ -430,6 +449,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { 
     | 
|
| 
       430 
449 
     | 
    
         
             
                /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
         
     | 
| 
       431 
450 
     | 
    
         
             
                /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
         
     | 
| 
       432 
451 
     | 
    
         
             
                /* .cpy_tensor_to   = */ ggml_backend_cpu_buffer_cpy_tensor_to,
         
     | 
| 
      
 452 
     | 
    
         
            +
                /* .clear           = */ ggml_backend_cpu_buffer_clear,
         
     | 
| 
       433 
453 
     | 
    
         
             
            };
         
     | 
| 
       434 
454 
     | 
    
         | 
| 
       435 
455 
     | 
    
         
             
            static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
         
     | 
| 
         @@ -455,20 +475,70 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty 
     | 
|
| 
       455 
475 
     | 
    
         
             
                GGML_UNUSED(buft);
         
     | 
| 
       456 
476 
     | 
    
         
             
            }
         
     | 
| 
       457 
477 
     | 
    
         | 
| 
      
 478 
     | 
    
         
            +
            static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
         
     | 
| 
      
 479 
     | 
    
         
            +
                return true;
         
     | 
| 
      
 480 
     | 
    
         
            +
             
     | 
| 
      
 481 
     | 
    
         
            +
                GGML_UNUSED(buft);
         
     | 
| 
      
 482 
     | 
    
         
            +
            }
         
     | 
| 
      
 483 
     | 
    
         
            +
             
     | 
| 
       458 
484 
     | 
    
         
             
            ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
         
     | 
| 
       459 
     | 
    
         
            -
                static struct ggml_backend_buffer_type  
     | 
| 
      
 485 
     | 
    
         
            +
                static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
         
     | 
| 
       460 
486 
     | 
    
         
             
                    /* .iface = */ {
         
     | 
| 
       461 
487 
     | 
    
         
             
                        /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
         
     | 
| 
       462 
488 
     | 
    
         
             
                        /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
         
     | 
| 
       463 
489 
     | 
    
         
             
                        /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
         
     | 
| 
       464 
490 
     | 
    
         
             
                        /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
         
     | 
| 
      
 491 
     | 
    
         
            +
                        /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
         
     | 
| 
       465 
492 
     | 
    
         
             
                    },
         
     | 
| 
       466 
493 
     | 
    
         
             
                    /* .context = */ NULL,
         
     | 
| 
       467 
494 
     | 
    
         
             
                };
         
     | 
| 
       468 
495 
     | 
    
         | 
| 
       469 
     | 
    
         
            -
                return & 
     | 
| 
      
 496 
     | 
    
         
            +
                return &ggml_backend_cpu_buffer_type;
         
     | 
| 
       470 
497 
     | 
    
         
             
            }
         
     | 
| 
       471 
498 
     | 
    
         | 
| 
      
 499 
     | 
    
         
            +
            #ifdef GGML_USE_CPU_HBM
         
     | 
| 
      
 500 
     | 
    
         
            +
             
     | 
| 
      
 501 
     | 
    
         
            +
            // buffer type HBM
         
     | 
| 
      
 502 
     | 
    
         
            +
             
     | 
| 
      
 503 
     | 
    
         
            +
            #include <hbwmalloc.h>
         
     | 
| 
      
 504 
     | 
    
         
            +
             
     | 
| 
      
 505 
     | 
    
         
            +
            static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
         
     | 
| 
      
 506 
     | 
    
         
            +
                hbw_free(buffer->context);
         
     | 
| 
      
 507 
     | 
    
         
            +
            }
         
     | 
| 
      
 508 
     | 
    
         
            +
             
     | 
| 
      
 509 
     | 
    
         
            +
            static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
         
     | 
| 
      
 510 
     | 
    
         
            +
                //void * ptr = hbw_malloc(size);
         
     | 
| 
      
 511 
     | 
    
         
            +
                void * ptr;
         
     | 
| 
      
 512 
     | 
    
         
            +
                int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
         
     | 
| 
      
 513 
     | 
    
         
            +
                if (result != 0) {
         
     | 
| 
      
 514 
     | 
    
         
            +
                    fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
         
     | 
| 
      
 515 
     | 
    
         
            +
                    return NULL;
         
     | 
| 
      
 516 
     | 
    
         
            +
                }
         
     | 
| 
      
 517 
     | 
    
         
            +
             
     | 
| 
      
 518 
     | 
    
         
            +
                // FIXME: this is a hack to avoid having to implement a new buffer type
         
     | 
| 
      
 519 
     | 
    
         
            +
                ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
         
     | 
| 
      
 520 
     | 
    
         
            +
                buffer->buft = buft;
         
     | 
| 
      
 521 
     | 
    
         
            +
                buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
         
     | 
| 
      
 522 
     | 
    
         
            +
             
     | 
| 
      
 523 
     | 
    
         
            +
                return buffer;
         
     | 
| 
      
 524 
     | 
    
         
            +
            }
         
     | 
| 
      
 525 
     | 
    
         
            +
             
     | 
| 
      
 526 
     | 
    
         
            +
            ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
         
     | 
| 
      
 527 
     | 
    
         
            +
                static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
         
     | 
| 
      
 528 
     | 
    
         
            +
                    /* .iface    = */ {
         
     | 
| 
      
 529 
     | 
    
         
            +
                        /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
         
     | 
| 
      
 530 
     | 
    
         
            +
                        /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
         
     | 
| 
      
 531 
     | 
    
         
            +
                        /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
         
     | 
| 
      
 532 
     | 
    
         
            +
                        /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
         
     | 
| 
      
 533 
     | 
    
         
            +
                        /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
         
     | 
| 
      
 534 
     | 
    
         
            +
                    },
         
     | 
| 
      
 535 
     | 
    
         
            +
                    /* .context  = */ NULL,
         
     | 
| 
      
 536 
     | 
    
         
            +
                };
         
     | 
| 
      
 537 
     | 
    
         
            +
             
     | 
| 
      
 538 
     | 
    
         
            +
                return &ggml_backend_cpu_buffer_type_hbm;
         
     | 
| 
      
 539 
     | 
    
         
            +
            }
         
     | 
| 
      
 540 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 541 
     | 
    
         
            +
             
     | 
| 
       472 
542 
     | 
    
         
             
            struct ggml_backend_cpu_context {
         
     | 
| 
       473 
543 
     | 
    
         
             
                int n_threads;
         
     | 
| 
       474 
544 
     | 
    
         
             
                void * work_data;
         
     | 
| 
         @@ -505,7 +575,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend 
     | 
|
| 
       505 
575 
     | 
    
         
             
                struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
         
     | 
| 
       506 
576 
     | 
    
         | 
| 
       507 
577 
     | 
    
         
             
                cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
         
     | 
| 
       508 
     | 
    
         
            -
                cpu_plan->cgraph = *cgraph;
         
     | 
| 
      
 578 
     | 
    
         
            +
                cpu_plan->cgraph = *cgraph; // FIXME: deep copy
         
     | 
| 
       509 
579 
     | 
    
         | 
| 
       510 
580 
     | 
    
         
             
                if (cpu_plan->cplan.work_size > 0) {
         
     | 
| 
       511 
581 
     | 
    
         
             
                    cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
         
     | 
| 
         @@ -1180,7 +1250,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml 
     | 
|
| 
       1180 
1250 
     | 
    
         
             
            // utils
         
     | 
| 
       1181 
1251 
     | 
    
         
             
            void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
         
     | 
| 
       1182 
1252 
     | 
    
         
             
                GGML_ASSERT(tensor->buffer == NULL);
         
     | 
| 
       1183 
     | 
    
         
            -
                GGML_ASSERT(tensor->data == NULL);
         
     | 
| 
      
 1253 
     | 
    
         
            +
                //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
         
     | 
| 
       1184 
1254 
     | 
    
         
             
                GGML_ASSERT(tensor->view_src != NULL);
         
     | 
| 
       1185 
1255 
     | 
    
         
             
                GGML_ASSERT(tensor->view_src->buffer != NULL);
         
     | 
| 
       1186 
1256 
     | 
    
         
             
                GGML_ASSERT(tensor->view_src->data != NULL);
         
     | 
| 
         @@ -21,6 +21,7 @@ extern "C" { 
     | 
|
| 
       21 
21 
     | 
    
         
             
                GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
         
     | 
| 
       22 
22 
     | 
    
         
             
                GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
         
     | 
| 
       23 
23 
     | 
    
         
             
                GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
         
     | 
| 
      
 24 
     | 
    
         
            +
                GGML_API bool ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
         
     | 
| 
       24 
25 
     | 
    
         | 
| 
       25 
26 
     | 
    
         
             
                // buffer
         
     | 
| 
       26 
27 
     | 
    
         
             
                GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
         
     | 
| 
         @@ -29,6 +30,8 @@ extern "C" { 
     | 
|
| 
       29 
30 
     | 
    
         
             
                GGML_API void   ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
         
     | 
| 
       30 
31 
     | 
    
         
             
                GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
         
     | 
| 
       31 
32 
     | 
    
         
             
                GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
         
     | 
| 
      
 33 
     | 
    
         
            +
                GGML_API void   ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
         
     | 
| 
      
 34 
     | 
    
         
            +
                GGML_API bool   ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
         
     | 
| 
       32 
35 
     | 
    
         
             
                GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
         
     | 
| 
       33 
36 
     | 
    
         | 
| 
       34 
37 
     | 
    
         
             
                //
         
     | 
| 
         @@ -76,6 +79,10 @@ extern "C" { 
     | 
|
| 
       76 
79 
     | 
    
         | 
| 
       77 
80 
     | 
    
         
             
                GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
         
     | 
| 
       78 
81 
     | 
    
         | 
| 
      
 82 
     | 
    
         
            +
            #ifdef GGML_USE_CPU_HBM
         
     | 
| 
      
 83 
     | 
    
         
            +
                GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
         
     | 
| 
      
 84 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 85 
     | 
    
         
            +
             
     | 
| 
       79 
86 
     | 
    
         
             
                //
         
     | 
| 
       80 
87 
     | 
    
         
             
                // Backend registry
         
     | 
| 
       81 
88 
     | 
    
         
             
                //
         
     |