llama_cpp 0.5.2 → 0.6.0

Sign up to get free protection for your applications and access to all of the features.
@@ -195,6 +195,14 @@
195
195
  # define GGML_DEPRECATED(func, hint) func
196
196
  #endif
197
197
 
198
+ #ifndef __GNUC__
199
+ # define GGML_ATTRIBUTE_FORMAT(...)
200
+ #elif defined(__MINGW32__)
201
+ # define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
202
+ #else
203
+ # define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
204
+ #endif
205
+
198
206
  #include <stdint.h>
199
207
  #include <stddef.h>
200
208
  #include <stdbool.h>
@@ -206,8 +214,8 @@
206
214
  #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
207
215
 
208
216
  #define GGML_MAX_DIMS 4
209
- #define GGML_MAX_NODES 4096
210
- #define GGML_MAX_PARAMS 256
217
+ #define GGML_MAX_NODES 16384
218
+ #define GGML_MAX_PARAMS 1024
211
219
  #define GGML_MAX_CONTEXTS 64
212
220
  #define GGML_MAX_SRC 6
213
221
  #define GGML_MAX_NAME 64
@@ -240,6 +248,14 @@
240
248
  } \
241
249
  } while (0)
242
250
 
251
+ #ifndef NDEBUG
252
+ #define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
253
+ #elif defined(__GNUC__)
254
+ #define GGML_UNREACHABLE() __builtin_unreachable()
255
+ #else
256
+ #define GGML_UNREACHABLE() ((void) 0)
257
+ #endif
258
+
243
259
  // used to copy the number of elements and stride in bytes of tensors into local variables.
244
260
  // main purpose is to reduce code duplication and improve readability.
245
261
  //
@@ -270,7 +286,7 @@ extern "C" {
270
286
 
271
287
  #if defined(__ARM_NEON) && defined(__CUDACC__)
272
288
  typedef half ggml_fp16_t;
273
- #elif defined(__ARM_NEON) && !defined(_MSC_VER)
289
+ #elif defined(__ARM_NEON)
274
290
  typedef __fp16 ggml_fp16_t;
275
291
  #else
276
292
  typedef uint16_t ggml_fp16_t;
@@ -437,6 +453,12 @@ extern "C" {
437
453
  GGML_OBJECT_WORK_BUFFER
438
454
  };
439
455
 
456
+ enum ggml_log_level {
457
+ GGML_LOG_LEVEL_ERROR = 2,
458
+ GGML_LOG_LEVEL_WARN = 3,
459
+ GGML_LOG_LEVEL_INFO = 4
460
+ };
461
+
440
462
  // ggml object
441
463
  struct ggml_object {
442
464
  size_t offs;
@@ -459,8 +481,8 @@ extern "C" {
459
481
  int n_dims;
460
482
  int64_t ne[GGML_MAX_DIMS]; // number of elements
461
483
  size_t nb[GGML_MAX_DIMS]; // stride in bytes:
462
- // nb[0] = sizeof(type)
463
- // nb[1] = nb[0] * ne[0] + padding
484
+ // nb[0] = ggml_type_size(type)
485
+ // nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding
464
486
  // nb[i] = nb[i-1] * ne[i-1]
465
487
 
466
488
  // compute data
@@ -512,7 +534,15 @@ extern "C" {
512
534
  // next prime after GGML_MAX_NODES
513
535
  // #define GGML_GRAPH_HASHTABLE_SIZE 4099
514
536
  // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
515
- #define GGML_GRAPH_HASHTABLE_SIZE 8273
537
+ // #define GGML_GRAPH_HASHTABLE_SIZE 8273
538
+ // #define GGML_GRAPH_HASHTABLE_SIZE 16411
539
+ #define GGML_GRAPH_HASHTABLE_SIZE 32771
540
+
541
+ enum ggml_cgraph_eval_order {
542
+ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
543
+ GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
544
+ GGML_CGRAPH_EVAL_ORDER_COUNT
545
+ };
516
546
 
517
547
  // computation graph
518
548
  struct ggml_cgraph {
@@ -525,6 +555,8 @@ extern "C" {
525
555
 
526
556
  void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
527
557
 
558
+ enum ggml_cgraph_eval_order order;
559
+
528
560
  // performance
529
561
  int perf_runs;
530
562
  int64_t perf_cycles;
@@ -672,12 +704,21 @@ extern "C" {
672
704
  GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
673
705
  GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
674
706
 
707
+ // Converts a flat index into coordinates
708
+ GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
709
+
675
710
  GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
676
711
  GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
677
712
 
713
+ GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
714
+ GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
715
+
678
716
  GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
679
717
  GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
680
718
 
719
+ GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
720
+ GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
721
+
681
722
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
682
723
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
683
724
 
@@ -685,6 +726,7 @@ extern "C" {
685
726
 
686
727
  GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
687
728
  GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
729
+ GGML_ATTRIBUTE_FORMAT(2, 3)
688
730
  GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
689
731
 
690
732
  //
@@ -710,6 +752,12 @@ extern "C" {
710
752
  struct ggml_tensor * a,
711
753
  struct ggml_tensor * b);
712
754
 
755
+ GGML_API struct ggml_tensor * ggml_add_cast(
756
+ struct ggml_context * ctx,
757
+ struct ggml_tensor * a,
758
+ struct ggml_tensor * b,
759
+ enum ggml_type type);
760
+
713
761
  GGML_API struct ggml_tensor * ggml_add1(
714
762
  struct ggml_context * ctx,
715
763
  struct ggml_tensor * a,
@@ -819,6 +867,7 @@ extern "C" {
819
867
  struct ggml_tensor * a,
820
868
  struct ggml_tensor * b);
821
869
 
870
+ // sums repetitions in a into the shape of b
822
871
  GGML_API struct ggml_tensor * ggml_repeat_back(
823
872
  struct ggml_context * ctx,
824
873
  struct ggml_tensor * a,
@@ -1040,7 +1089,6 @@ extern "C" {
1040
1089
  size_t nb1,
1041
1090
  size_t offset);
1042
1091
 
1043
-
1044
1092
  // a -> b, return view(b)
1045
1093
  GGML_API struct ggml_tensor * ggml_cpy(
1046
1094
  struct ggml_context * ctx,
@@ -1063,6 +1111,33 @@ extern "C" {
1063
1111
  struct ggml_context * ctx,
1064
1112
  struct ggml_tensor * a);
1065
1113
 
1114
+ // make contiguous, with new shape
1115
+ GGML_API struct ggml_tensor * ggml_cont_1d(
1116
+ struct ggml_context * ctx,
1117
+ struct ggml_tensor * a,
1118
+ int64_t ne0);
1119
+
1120
+ GGML_API struct ggml_tensor * ggml_cont_2d(
1121
+ struct ggml_context * ctx,
1122
+ struct ggml_tensor * a,
1123
+ int64_t ne0,
1124
+ int64_t ne1);
1125
+
1126
+ GGML_API struct ggml_tensor * ggml_cont_3d(
1127
+ struct ggml_context * ctx,
1128
+ struct ggml_tensor * a,
1129
+ int64_t ne0,
1130
+ int64_t ne1,
1131
+ int64_t ne2);
1132
+
1133
+ GGML_API struct ggml_tensor * ggml_cont_4d(
1134
+ struct ggml_context * ctx,
1135
+ struct ggml_tensor * a,
1136
+ int64_t ne0,
1137
+ int64_t ne1,
1138
+ int64_t ne2,
1139
+ int64_t ne3);
1140
+
1066
1141
  // return view(a), b specifies the new shape
1067
1142
  // TODO: when we start computing gradient, make a copy instead of view
1068
1143
  GGML_API struct ggml_tensor * ggml_reshape(
@@ -1210,14 +1285,15 @@ extern "C" {
1210
1285
  struct ggml_tensor * b);
1211
1286
 
1212
1287
  // rotary position embedding
1213
- // if mode & 1 == 1, skip n_past elements
1288
+ // if mode & 1 == 1, skip n_past elements (DEPRECATED)
1214
1289
  // if mode & 2 == 1, GPT-NeoX style
1215
1290
  // if mode & 4 == 1, ChatGLM style
1216
- // TODO: avoid creating a new tensor every time
1291
+ //
1292
+ // b is an int32 vector with size a->ne[2]; it contains the positions
1217
1293
  GGML_API struct ggml_tensor * ggml_rope(
1218
1294
  struct ggml_context * ctx,
1219
1295
  struct ggml_tensor * a,
1220
- int n_past,
1296
+ struct ggml_tensor * b,
1221
1297
  int n_dims,
1222
1298
  int mode,
1223
1299
  int n_ctx);
@@ -1226,7 +1302,7 @@ extern "C" {
1226
1302
  GGML_API struct ggml_tensor * ggml_rope_inplace(
1227
1303
  struct ggml_context * ctx,
1228
1304
  struct ggml_tensor * a,
1229
- int n_past,
1305
+ struct ggml_tensor * b,
1230
1306
  int n_dims,
1231
1307
  int mode,
1232
1308
  int n_ctx);
@@ -1235,7 +1311,7 @@ extern "C" {
1235
1311
  GGML_API struct ggml_tensor * ggml_rope_custom(
1236
1312
  struct ggml_context * ctx,
1237
1313
  struct ggml_tensor * a,
1238
- int n_past,
1314
+ struct ggml_tensor * b,
1239
1315
  int n_dims,
1240
1316
  int mode,
1241
1317
  int n_ctx,
@@ -1246,7 +1322,7 @@ extern "C" {
1246
1322
  GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
1247
1323
  struct ggml_context * ctx,
1248
1324
  struct ggml_tensor * a,
1249
- int n_past,
1325
+ struct ggml_tensor * b,
1250
1326
  int n_dims,
1251
1327
  int mode,
1252
1328
  int n_ctx,
@@ -1257,7 +1333,7 @@ extern "C" {
1257
1333
  GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
1258
1334
  struct ggml_context * ctx,
1259
1335
  struct ggml_tensor * a,
1260
- int n_past,
1336
+ struct ggml_tensor * b,
1261
1337
  int n_dims,
1262
1338
  float base,
1263
1339
  bool down);
@@ -1267,7 +1343,7 @@ extern "C" {
1267
1343
  GGML_API struct ggml_tensor * ggml_rope_back(
1268
1344
  struct ggml_context * ctx,
1269
1345
  struct ggml_tensor * a,
1270
- int n_past,
1346
+ struct ggml_tensor * b,
1271
1347
  int n_dims,
1272
1348
  int mode,
1273
1349
  int n_ctx,
@@ -1647,6 +1723,16 @@ extern "C" {
1647
1723
  // dump the graph into a file using the dot format
1648
1724
  GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
1649
1725
 
1726
+ // build gradient checkpointing backward graph gb for gf using provided checkpoints
1727
+ // gb_tmp will contain original backward graph with rewritten backward process nodes,
1728
+ // but without the second forward pass nodes.
1729
+ GGML_API void ggml_build_backward_gradient_checkpointing(
1730
+ struct ggml_context * ctx,
1731
+ struct ggml_cgraph * gf,
1732
+ struct ggml_cgraph * gb,
1733
+ struct ggml_cgraph * gb_tmp,
1734
+ struct ggml_tensor * * checkpoints,
1735
+ int n_checkpoints);
1650
1736
  //
1651
1737
  // optimization
1652
1738
  //
@@ -1681,7 +1767,8 @@ extern "C" {
1681
1767
  GGML_LINESEARCH_INVALID_PARAMETERS,
1682
1768
  };
1683
1769
 
1684
- typedef void (*ggml_opt_callback)(void * data, float * sched);
1770
+ typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
1771
+ typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
1685
1772
 
1686
1773
  // optimization parameters
1687
1774
  //
@@ -1712,6 +1799,8 @@ extern "C" {
1712
1799
  bool print_forward_graph;
1713
1800
  bool print_backward_graph;
1714
1801
 
1802
+ int n_gradient_accumulation;
1803
+
1715
1804
  // ADAM parameters
1716
1805
  struct {
1717
1806
  int n_iter;
@@ -1757,6 +1846,7 @@ extern "C" {
1757
1846
  float loss_after;
1758
1847
 
1759
1848
  struct {
1849
+ struct ggml_tensor * g; // current gradient
1760
1850
  struct ggml_tensor * m; // first moment
1761
1851
  struct ggml_tensor * v; // second moment
1762
1852
  struct ggml_tensor * pf; // past function values
@@ -1866,39 +1956,39 @@ extern "C" {
1866
1956
 
1867
1957
  GGML_API const char * gguf_type_name(enum gguf_type type);
1868
1958
 
1869
- GGML_API int gguf_get_version (struct gguf_context * ctx);
1870
- GGML_API size_t gguf_get_alignment (struct gguf_context * ctx);
1871
- GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
1872
- GGML_API void * gguf_get_data (struct gguf_context * ctx);
1873
-
1874
- GGML_API int gguf_get_n_kv(struct gguf_context * ctx);
1875
- GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);
1876
- GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
1877
-
1878
- GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
1879
- GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
1880
-
1881
- // results are undefined if the wrong type is used for the key
1882
- GGML_API uint8_t gguf_get_val_u8 (struct gguf_context * ctx, int i);
1883
- GGML_API int8_t gguf_get_val_i8 (struct gguf_context * ctx, int i);
1884
- GGML_API uint16_t gguf_get_val_u16 (struct gguf_context * ctx, int i);
1885
- GGML_API int16_t gguf_get_val_i16 (struct gguf_context * ctx, int i);
1886
- GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
1887
- GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
1888
- GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
1889
- GGML_API uint64_t gguf_get_val_u64 (struct gguf_context * ctx, int i);
1890
- GGML_API int64_t gguf_get_val_i64 (struct gguf_context * ctx, int i);
1891
- GGML_API double gguf_get_val_f64 (struct gguf_context * ctx, int i);
1892
- GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
1893
- GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
1894
- GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
1895
- GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
1896
- GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
1897
-
1898
- GGML_API int gguf_get_n_tensors (struct gguf_context * ctx);
1899
- GGML_API int gguf_find_tensor (struct gguf_context * ctx, const char * name);
1900
- GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
1901
- GGML_API char * gguf_get_tensor_name (struct gguf_context * ctx, int i);
1959
+ GGML_API int gguf_get_version (const struct gguf_context * ctx);
1960
+ GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
1961
+ GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
1962
+ GGML_API void * gguf_get_data (const struct gguf_context * ctx);
1963
+
1964
+ GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
1965
+ GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
1966
+ GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
1967
+
1968
+ GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
1969
+ GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
1970
+
1971
+ // will abort if the wrong type is used for the key
1972
+ GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id);
1973
+ GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id);
1974
+ GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
1975
+ GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
1976
+ GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
1977
+ GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
1978
+ GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
1979
+ GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
1980
+ GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
1981
+ GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
1982
+ GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
1983
+ GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
1984
+ GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
1985
+ GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
1986
+ GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
1987
+
1988
+ GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
1989
+ GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
1990
+ GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
1991
+ GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
1902
1992
 
1903
1993
  // overrides existing values or adds a new one
1904
1994
  GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
@@ -1943,11 +2033,11 @@ extern "C" {
1943
2033
  //
1944
2034
 
1945
2035
  // write the entire context to a binary file
1946
- GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
2036
+ GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
1947
2037
 
1948
2038
  // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
1949
- GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
1950
- GGML_API void gguf_get_meta_data(struct gguf_context * ctx, void * data);
2039
+ GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
2040
+ GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
1951
2041
 
1952
2042
  //
1953
2043
  // system info
@@ -1961,6 +2051,7 @@ extern "C" {
1961
2051
  GGML_API int ggml_cpu_has_fma (void);
1962
2052
  GGML_API int ggml_cpu_has_neon (void);
1963
2053
  GGML_API int ggml_cpu_has_arm_fma (void);
2054
+ GGML_API int ggml_cpu_has_metal (void);
1964
2055
  GGML_API int ggml_cpu_has_f16c (void);
1965
2056
  GGML_API int ggml_cpu_has_fp16_va (void);
1966
2057
  GGML_API int ggml_cpu_has_wasm_simd (void);