RubyGems - llama_cpp - Versions diffs - 0.0.3 → 0.0.5 - Mend

llama_cpp 0.0.3 → 0.0.5

Files changed (14)

checksums.yaml +4 -4
data/CHANGELOG.md +36 -0
data/README.md +5 -4
data/ext/llama_cpp/extconf.rb +38 -0
data/ext/llama_cpp/llama_cpp.cpp +118 -2
data/ext/llama_cpp/src/ggml.c +1740 -658
data/ext/llama_cpp/src/ggml.h +84 -16
data/ext/llama_cpp/src/llama.cpp +1108 -756
data/ext/llama_cpp/src/llama.h +37 -1
data/ext/llama_cpp/src/llama_util.h +396 -0
data/lib/llama_cpp/version.rb +2 -2
data/lib/llama_cpp.rb +3 -3
data/sig/llama_cpp.rbs +6 -0
metadata +3 -2

data/ext/llama_cpp/src/ggml.h CHANGED

@@ -177,11 +177,12 @@ extern "C" {
 #include <stddef.h>
 #include <stdbool.h>
-#define GGML_MAX_DIMS     4
-#define GGML_MAX_NODES    4096
-#define GGML_MAX_PARAMS   16
-#define GGML_MAX_CONTEXTS 64
-#define GGML_MAX_OPT      4
+#define GGML_MAX_DIMS          4
+#define GGML_MAX_NODES         4096
+#define GGML_MAX_PARAMS        16
+#define GGML_MAX_CONTEXTS      64
+#define GGML_MAX_OPT           4
+#define GGML_DEFAULT_N_THREADS 4
 #ifdef __ARM_NEON
 // we use the built-in 16-bit float type
@@ -198,13 +199,15 @@ struct ggml_object;
 struct ggml_context;
 enum ggml_type {
-    GGML_TYPE_Q4_0,
-    GGML_TYPE_Q4_1,
+    // explicitly numbered values are used in llama.cpp files
+    GGML_TYPE_F32  = 0,
+    GGML_TYPE_F16  = 1,
+    GGML_TYPE_Q4_0 = 2,
+    GGML_TYPE_Q4_1 = 3,
+    GGML_TYPE_Q8_0 = 4,
     GGML_TYPE_I8,
     GGML_TYPE_I16,
     GGML_TYPE_I32,
-    GGML_TYPE_F16,
-    GGML_TYPE_F32,
     GGML_TYPE_COUNT,
 };
@@ -236,6 +239,7 @@ enum ggml_op {
     GGML_OP_SCALE,
     GGML_OP_CPY,
+    GGML_OP_CONT,
     GGML_OP_RESHAPE,
     GGML_OP_VIEW,
     GGML_OP_PERMUTE,
@@ -250,9 +254,25 @@ enum ggml_op {
     GGML_OP_FLASH_ATTN,
     GGML_OP_FLASH_FF,
+    GGML_OP_MAP_UNARY,
+    GGML_OP_MAP_BINARY,
     GGML_OP_COUNT,
 };
+// ggml object
+struct ggml_object {
+    size_t offs;
+    size_t size;
+    struct ggml_object * next;
+    char padding[8];
+};
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
 // n-dimensional tensor
 struct ggml_tensor {
     enum ggml_type type;
@@ -335,6 +355,8 @@ int    ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
 float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+const char * ggml_type_name(enum ggml_type type);
 size_t ggml_element_size(const struct ggml_tensor * tensor);
 struct ggml_context * ggml_init(struct ggml_init_params params);
@@ -344,13 +366,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
-bool ggml_mlock_supported(void);
-bool ggml_mlock(
-        struct ggml_context * ctx,
-        const void *opt_extra_addr,
-        size_t opt_extra_len,
-        char **err_p);
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum   ggml_type type,
@@ -415,6 +430,12 @@ struct ggml_tensor * ggml_add(
         struct ggml_tensor  * a,
         struct ggml_tensor  * b);
+struct ggml_tensor * ggml_add_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b);
 struct ggml_tensor * ggml_sub(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
@@ -519,6 +540,11 @@ struct ggml_tensor * ggml_cpy(
         struct ggml_tensor  * a,
         struct ggml_tensor  * b);
+// make contiguous
+struct ggml_tensor * ggml_cont(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a);
 // return view(a), b specifies the new shape
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape(
@@ -638,6 +664,21 @@ struct ggml_tensor * ggml_flash_ff(
         struct ggml_tensor  * c0,
         struct ggml_tensor  * c1);
+// Mapping operations
+typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+struct ggml_tensor * ggml_map_unary_f32(
+        struct ggml_context        * ctx,
+        struct ggml_tensor         * a,
+        const  ggml_unary_op_f32_t fun);
+struct ggml_tensor * ggml_map_binary_f32(
+        struct ggml_context         * ctx,
+        struct ggml_tensor          * a,
+        struct ggml_tensor          * b,
+        const  ggml_binary_op_f32_t fun);
 //
 // automatic differentiation
 //
@@ -773,6 +814,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
 int ggml_cpu_has_avx(void);
 int ggml_cpu_has_avx2(void);
 int ggml_cpu_has_avx512(void);
+int ggml_cpu_has_avx512_vbmi(void);
+int ggml_cpu_has_avx512_vnni(void);
 int ggml_cpu_has_fma(void);
 int ggml_cpu_has_neon(void);
 int ggml_cpu_has_arm_fma(void);
@@ -783,6 +826,31 @@ int ggml_cpu_has_blas(void);
 int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);
+//
+// Internal types and functions exposed for tests and benchmarks
+//
+#ifdef  __cplusplus
+// restrict not standard in C++
+#define GGML_RESTRICT
+#else
+#define GGML_RESTRICT restrict
+#endif
+typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+typedef struct {
+    dequantize_row_q_t dequantize_row_q;
+    quantize_row_q_t   quantize_row_q;
+    quantize_row_q_t   quantize_row_q_reference;
+    quantize_row_q_t   quantize_row_q_dot;
+    vec_dot_q_t        vec_dot_q;
+} quantize_fns_t;
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
 #ifdef  __cplusplus
 }
 #endif