llama_cpp 0.9.1 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +383 -210
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +277 -53
- data/ext/llama_cpp/src/ggml-cuda.h +5 -0
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +112 -30
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +173 -73
- data/ext/llama_cpp/src/ggml.c +877 -1707
- data/ext/llama_cpp/src/ggml.h +68 -45
- data/ext/llama_cpp/src/llama.cpp +475 -117
- data/ext/llama_cpp/src/llama.h +11 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
| @@ -1,51 +1,20 @@ | |
| 1 1 | 
             
            #pragma once
         | 
| 2 2 |  | 
| 3 3 | 
             
            #include "ggml.h"
         | 
| 4 | 
            +
            #include "ggml-alloc.h"
         | 
| 4 5 |  | 
| 5 6 | 
             
            #ifdef  __cplusplus
         | 
| 6 7 | 
             
            extern "C" {
         | 
| 7 8 | 
             
            #endif
         | 
| 8 | 
            -
                struct ggml_backend;
         | 
| 9 | 
            -
                struct ggml_backend_buffer;
         | 
| 10 | 
            -
             | 
| 11 | 
            -
                // type-erased backend-specific types / wrappers
         | 
| 12 | 
            -
                typedef void * ggml_backend_context_t;
         | 
| 13 | 
            -
                typedef void * ggml_backend_graph_plan_t;
         | 
| 14 | 
            -
                typedef void * ggml_backend_buffer_context_t;
         | 
| 15 | 
            -
             | 
| 16 | 
            -
                // avoid accessing internals of these types
         | 
| 17 | 
            -
                typedef struct ggml_backend        * ggml_backend_t;
         | 
| 18 | 
            -
                typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
         | 
| 19 9 |  | 
| 20 10 | 
             
                //
         | 
| 21 | 
            -
                //  | 
| 11 | 
            +
                // Backend buffer
         | 
| 22 12 | 
             
                //
         | 
| 23 13 |  | 
| 24 | 
            -
                struct  | 
| 25 | 
            -
             | 
| 26 | 
            -
                    void * (*get_base)      (ggml_backend_buffer_t buffer); // get base pointer
         | 
| 27 | 
            -
                    size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
         | 
| 28 | 
            -
                    void   (*init_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
         | 
| 29 | 
            -
                    void   (*free_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
         | 
| 30 | 
            -
                };
         | 
| 31 | 
            -
             | 
| 32 | 
            -
                // TODO: hide behind API
         | 
| 33 | 
            -
                struct ggml_backend_buffer {
         | 
| 34 | 
            -
                    struct ggml_backend_buffer_i iface;
         | 
| 35 | 
            -
             | 
| 36 | 
            -
                    ggml_backend_t                backend;
         | 
| 37 | 
            -
                    ggml_backend_buffer_context_t context;
         | 
| 38 | 
            -
             | 
| 39 | 
            -
                    size_t size;
         | 
| 40 | 
            -
                };
         | 
| 14 | 
            +
                struct ggml_backend_buffer;
         | 
| 15 | 
            +
                typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
         | 
| 41 16 |  | 
| 42 17 | 
             
                // backend buffer functions
         | 
| 43 | 
            -
                GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
         | 
| 44 | 
            -
                        struct ggml_backend                  * backend,
         | 
| 45 | 
            -
                        struct ggml_backend_buffer_i           iface,
         | 
| 46 | 
            -
                               ggml_backend_buffer_context_t   context,
         | 
| 47 | 
            -
                               size_t                          size);
         | 
| 48 | 
            -
             | 
| 49 18 | 
             
                GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
         | 
| 50 19 | 
             
                GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
         | 
| 51 20 | 
             
                GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
         | 
| @@ -55,50 +24,13 @@ extern "C" { | |
| 55 24 | 
             
                GGML_API void   ggml_backend_buffer_free_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
         | 
| 56 25 |  | 
| 57 26 | 
             
                //
         | 
| 58 | 
            -
                //  | 
| 27 | 
            +
                // Backend
         | 
| 59 28 | 
             
                //
         | 
| 60 29 |  | 
| 61 | 
            -
                struct  | 
| 62 | 
            -
             | 
| 63 | 
            -
             | 
| 64 | 
            -
                    void (*free)(ggml_backend_t backend);
         | 
| 65 | 
            -
             | 
| 66 | 
            -
                    // buffer allocation
         | 
| 67 | 
            -
                    ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
         | 
| 68 | 
            -
             | 
| 69 | 
            -
                    // get buffer alignment
         | 
| 70 | 
            -
                    size_t (*get_alignment)(ggml_backend_t backend);
         | 
| 71 | 
            -
             | 
| 72 | 
            -
                    // tensor data access
         | 
| 73 | 
            -
                    // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
         | 
| 74 | 
            -
                    void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
         | 
| 75 | 
            -
                    void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
         | 
| 76 | 
            -
                    void (*synchronize)     (ggml_backend_t backend);
         | 
| 77 | 
            -
             | 
| 78 | 
            -
                    // (optional) copy tensor between different backends, allow for single-copy transfers
         | 
| 79 | 
            -
                    void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
         | 
| 80 | 
            -
                    void (*cpy_tensor_to)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
         | 
| 81 | 
            -
             | 
| 82 | 
            -
                    // compute graph with a plan
         | 
| 83 | 
            -
                    ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
         | 
| 84 | 
            -
                    void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         | 
| 85 | 
            -
                    void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         | 
| 86 | 
            -
             | 
| 87 | 
            -
                    // compute graph without a plan
         | 
| 88 | 
            -
                    void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
         | 
| 89 | 
            -
             | 
| 90 | 
            -
                    // check if the backend supports an operation
         | 
| 91 | 
            -
                    bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
         | 
| 92 | 
            -
                };
         | 
| 93 | 
            -
             | 
| 94 | 
            -
                // TODO: hide behind API
         | 
| 95 | 
            -
                struct ggml_backend {
         | 
| 96 | 
            -
                    struct ggml_backend_i iface;
         | 
| 97 | 
            -
             | 
| 98 | 
            -
                    ggml_backend_context_t context;
         | 
| 99 | 
            -
                };
         | 
| 30 | 
            +
                struct ggml_backend;
         | 
| 31 | 
            +
                typedef struct ggml_backend * ggml_backend_t;
         | 
| 32 | 
            +
                typedef void * ggml_backend_graph_plan_t;
         | 
| 100 33 |  | 
| 101 | 
            -
                // backend helper functions
         | 
| 102 34 | 
             
                GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);
         | 
| 103 35 |  | 
| 104 36 | 
             
                GGML_API const char * ggml_backend_name(ggml_backend_t backend);
         | 
| @@ -133,11 +65,72 @@ extern "C" { | |
| 133 65 | 
             
                GGML_API ggml_backend_t ggml_backend_cpu_init(void);
         | 
| 134 66 |  | 
| 135 67 | 
             
                GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
         | 
| 136 | 
            -
             | 
| 137 68 | 
             
                GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
         | 
| 138 69 |  | 
| 70 | 
            +
                // Create a backend buffer from an existing pointer
         | 
| 139 71 | 
             
                GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);
         | 
| 140 72 |  | 
| 73 | 
            +
             | 
| 74 | 
            +
                //
         | 
| 75 | 
            +
                // Backend scheduler
         | 
| 76 | 
            +
                //
         | 
| 77 | 
            +
             | 
| 78 | 
            +
                // The backend scheduler allows for multiple backends to be used together
         | 
| 79 | 
            +
                // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
         | 
| 80 | 
            +
                // The backends are selected based on:
         | 
| 81 | 
            +
                // - the backend that supports the operation
         | 
| 82 | 
            +
                // - the location of the pre-allocated tensors (e.g. the weights)
         | 
| 83 | 
            +
                /*
         | 
| 84 | 
            +
                  Example usage:
         | 
| 85 | 
            +
             | 
| 86 | 
            +
                    sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends);
         | 
| 87 | 
            +
                    // sched is initialized with measure allocators and cannot be used until allocated with a measure graph
         | 
| 88 | 
            +
             | 
| 89 | 
            +
                    // initialize buffers from a measure graph
         | 
| 90 | 
            +
                    measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed
         | 
| 91 | 
            +
             | 
| 92 | 
            +
                    // in build_graph:
         | 
| 93 | 
            +
                    build_graph(...) {
         | 
| 94 | 
            +
                        // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
         | 
| 95 | 
            +
                        alloc_cpu = ggml_backend_sched_get_tallocr(sched, backend_cpu);
         | 
| 96 | 
            +
                        ggml_tallocr_alloc(alloc_cpu, tensor);
         | 
| 97 | 
            +
             | 
| 98 | 
            +
                        // manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
         | 
| 99 | 
            +
                        struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
         | 
| 100 | 
            +
                        ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
         | 
| 101 | 
            +
                    }
         | 
| 102 | 
            +
             | 
| 103 | 
            +
                    // allocate backend buffers from measure graph
         | 
| 104 | 
            +
                    ggml_backend_sched_init_measure(sched, measure_graph);
         | 
| 105 | 
            +
             | 
| 106 | 
            +
                    // the scheduler is now ready to compute graphs
         | 
| 107 | 
            +
             | 
| 108 | 
            +
                    // compute
         | 
| 109 | 
            +
                    graph = build_graph(sched);
         | 
| 110 | 
            +
                    ggml_backend_sched_graph_compute(sched, graph);
         | 
| 111 | 
            +
                */
         | 
| 112 | 
            +
             | 
| 113 | 
            +
                struct ggml_backend_sched;
         | 
| 114 | 
            +
                typedef struct ggml_backend_sched * ggml_backend_sched_t;
         | 
| 115 | 
            +
             | 
| 116 | 
            +
                // Initialize a backend scheduler
         | 
| 117 | 
            +
                GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends);
         | 
| 118 | 
            +
             | 
| 119 | 
            +
                GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
         | 
| 120 | 
            +
             | 
| 121 | 
            +
                // Initialize backend buffers from a measure graph
         | 
| 122 | 
            +
                GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
         | 
| 123 | 
            +
             | 
| 124 | 
            +
                GGML_API ggml_tallocr_t        ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
         | 
| 125 | 
            +
                GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
         | 
| 126 | 
            +
             | 
| 127 | 
            +
                GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
         | 
| 128 | 
            +
             | 
| 129 | 
            +
                // Allocate and compute a graph on the backend scheduler
         | 
| 130 | 
            +
                GGML_API void ggml_backend_sched_graph_compute(
         | 
| 131 | 
            +
                        ggml_backend_sched_t sched,
         | 
| 132 | 
            +
                        struct ggml_cgraph * graph);
         | 
| 133 | 
            +
             | 
| 141 134 | 
             
            #ifdef  __cplusplus
         | 
| 142 135 | 
             
            }
         | 
| 143 136 | 
             
            #endif
         |