llama_cpp 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +93 -15
- data/ext/llama_cpp/src/ggml-cuda.h +2 -0
- data/ext/llama_cpp/src/ggml-opencl.c +85 -122
- data/ext/llama_cpp/src/ggml.c +6268 -4208
- data/ext/llama_cpp/src/ggml.h +205 -12
- data/ext/llama_cpp/src/llama.cpp +159 -79
- data/ext/llama_cpp/src/llama.h +10 -10
- data/lib/llama_cpp/client.rb +1 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -4
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 33b146badd1bebdf9588e48c0adac1f9924a0653aa5ec806fdf5dd288ef665d8
+  data.tar.gz: 134606db2b9fb10b51fc82f410d6653a6481b828d9fd05390b1570d6e198526a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 462d9e00121408c7af3934b0a663b29f99d5ad28f60a3471155509463bf26a14792c484d1fdc6054460941ae011d39b510774e225ad4ec03d60ce20a1dfef667
+  data.tar.gz: 4bf447ac55bba2b62d204dc975528de6664fe53af89df8ba4aa4172d4dbff709ac5b14a944326be5c71d64baa2cde00b60f7ba5e916e1fb68123c595f74ce24f
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,15 @@
 ## [Unreleased]
 
+## [[0.1.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.0...v0.1.1)] - 2023-05-21
+
+- Add load_session_file method to Context
+- Add save_session_file method to Context
+
+**Breaking Changes**
+
+- Bump bundled llama.cpp from master-173d0e6 to master-6986c78
+- bump LLAMA_FILE_VERSION to 2
+
 ## [[0.1.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.7...v0.1.0)] - 2023-05-20
 
 **Breaking Changes**
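The two methods added in 0.1.1 are keyword-argument based: load_session_file takes session_path and returns the Array of token ids stored in the file, while save_session_file takes session_path and session_tokens (see the binding code under data/ext/llama_cpp/llama_cpp.cpp below). A minimal usage sketch, assuming `context` is an already initialized LLaMACpp::Context and `tokens` is an Array of Integer token ids that the context has already evaluated; the file name and variable names are placeholders, not part of the package:

# Sketch only: `context` and `tokens` are assumed to exist already.
context.save_session_file(session_path: 'prompt.session', session_tokens: tokens)

# A later run can restore the cached state instead of re-evaluating the prompt;
# the return value is the Array of token ids stored in the session file.
restored_tokens = context.load_session_file(session_path: 'prompt.session')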
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -292,8 +292,6 @@ public:
     // rb_define_method(rb_cLLaMAContextParams, "initialize", RUBY_METHOD_FUNC(_llama_context_params_init), 0);
     rb_define_method(rb_cLLaMAContextParams, "n_ctx=", RUBY_METHOD_FUNC(_llama_context_params_set_n_ctx), 1);
     rb_define_method(rb_cLLaMAContextParams, "n_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_n_ctx), 0);
-    rb_define_method(rb_cLLaMAContextParams, "n_parts=", RUBY_METHOD_FUNC(_llama_context_params_set_n_parts), 1);
-    rb_define_method(rb_cLLaMAContextParams, "n_parts", RUBY_METHOD_FUNC(_llama_context_params_get_n_parts), 0);
     rb_define_method(rb_cLLaMAContextParams, "seed=", RUBY_METHOD_FUNC(_llama_context_params_set_seed), 1);
     rb_define_method(rb_cLLaMAContextParams, "seed", RUBY_METHOD_FUNC(_llama_context_params_get_seed), 0);
     rb_define_method(rb_cLLaMAContextParams, "f16_kv=", RUBY_METHOD_FUNC(_llama_context_params_set_f16_kv), 1);
@@ -331,18 +329,6 @@ private:
     return INT2NUM(ptr->params.n_ctx);
   };
 
-  // n_parts
-  static VALUE _llama_context_params_set_n_parts(VALUE self, VALUE n_parts) {
-    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-    ptr->params.n_parts = NUM2INT(n_parts);
-    return INT2NUM(ptr->params.n_parts);
-  };
-
-  static VALUE _llama_context_params_get_n_parts(VALUE self) {
-    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
-    return INT2NUM(ptr->params.n_parts);
-  };
-
   // seed
   static VALUE _llama_context_params_set_seed(VALUE self, VALUE seed) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -494,6 +480,8 @@ public:
     rb_define_method(rb_cLLaMAContext, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_context_apply_lora_from_file), -1);
     rb_define_method(rb_cLLaMAContext, "kv_cache_token_count", RUBY_METHOD_FUNC(_llama_context_kv_cache_token_count), 0);
     rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
+    rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
+    rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
     rb_define_method(rb_cLLaMAContext, "sample_repetition_penalty", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalty), -1);
     rb_define_method(rb_cLLaMAContext, "sample_frequency_and_presence_penalties", RUBY_METHOD_FUNC(_llama_context_sample_frequency_and_presence_penalties), -1);
     rb_define_method(rb_cLLaMAContext, "sample_softmax", RUBY_METHOD_FUNC(_llama_context_sample_softmax), 1);
@@ -870,6 +858,97 @@ private:
     return Qnil;
   };
 
+  static VALUE _llama_context_load_session_file(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[1] = { rb_intern("session_path") };
+    VALUE kw_values[1] = { Qundef };
+    VALUE candidates = Qnil;
+    VALUE last_n_tokens = Qnil;
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_STRING)) {
+      rb_raise(rb_eArgError, "session_path must be a String");
+      return Qnil;
+    }
+
+    VALUE filename = kw_values[0];
+
+    LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+    if (ctx_ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+
+    LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(rb_iv_get(self, "@params"));
+    const int n_ctx = prms_ptr->params.n_ctx;
+
+    std::vector<llama_token> session_tokens(n_ctx);
+    size_t n_token_count_out = 0;
+
+    try {
+      bool res = llama_load_session_file(ctx_ptr->ctx, StringValueCStr(filename), session_tokens.data(), session_tokens.capacity(), &n_token_count_out);
+      if (!res) {
+        rb_raise(rb_eRuntimeError, "Failed to load session file");
+        return Qnil;
+      }
+      session_tokens.resize(n_token_count_out);
+    } catch (const std::runtime_error& e) {
+      rb_raise(rb_eRuntimeError, "%s", e.what());
+      return Qnil;
+    }
+
+    VALUE ary_session_tokens = rb_ary_new2(n_token_count_out);
+    for (size_t i = 0; i < n_token_count_out; i++) {
+      rb_ary_store(ary_session_tokens, i, INT2NUM(session_tokens[i]));
+    }
+
+    RB_GC_GUARD(filename);
+    return ary_session_tokens;
+  }
+
+  static VALUE _llama_context_save_session_file(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[2] = { rb_intern("session_path"), rb_intern("session_tokens") };
+    VALUE kw_values[2] = { Qundef, Qundef };
+    VALUE candidates = Qnil;
+    VALUE last_n_tokens = Qnil;
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_STRING)) {
+      rb_raise(rb_eArgError, "session_path must be a String");
+      return Qnil;
+    }
+    if (!RB_TYPE_P(kw_values[1], T_ARRAY)) {
+      rb_raise(rb_eArgError, "session_tokens must be an Array");
+      return Qnil;
+    }
+
+    VALUE filename = kw_values[0];
+    const size_t sz_session_tokens = RARRAY_LEN(kw_values[1]);
+    std::vector<llama_token> session_tokens(sz_session_tokens);
+    for (size_t i = 0; i < sz_session_tokens; i++) {
+      session_tokens[i] = NUM2INT(rb_ary_entry(kw_values[1], i));
+    }
+
+    LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+    if (ctx_ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+
+    bool res = llama_save_session_file(ctx_ptr->ctx, StringValueCStr(filename), session_tokens.data(), sz_session_tokens);
+
+    if (!res) {
+      rb_raise(rb_eRuntimeError, "Failed to save session file");
+      return Qnil;
+    }
+
+    RB_GC_GUARD(filename);
+    return Qnil;
+  }
+
   static VALUE _llama_context_sample_repetition_penalty(int argc, VALUE* argv, VALUE self) {
     VALUE kw_args = Qnil;
     ID kw_table[1] = { rb_intern("penalty") };
@@ -1411,7 +1490,6 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16));
- rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_2", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_2));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q8_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q8_0));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_0));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_1", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_1));
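Because ContextParams#n_parts= / #n_parts and the LLAMA_FTYPE_MOSTLY_Q4_2 constant are removed above, code written against 0.1.0 may need a small adjustment when upgrading. A hypothetical sketch (the n_ctx= and seed= accessors registered in the binding remain available):

params = LLaMACpp::ContextParams.new
params.n_ctx = 512
params.seed  = 42
# params.n_parts = 1  # removed in 0.1.1; ContextParams no longer exposes n_parts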
data/ext/llama_cpp/src/ggml-cuda.h
CHANGED
@@ -14,6 +14,8 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
 void * ggml_cuda_host_malloc(size_t size);
 void ggml_cuda_host_free(void * ptr);
 
+void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
+
 #ifdef __cplusplus
 }
 #endif
data/ext/llama_cpp/src/ggml-opencl.c
CHANGED
@@ -12,129 +12,129 @@
 #define MULTILINE_QUOTE(...) #__VA_ARGS__
 const char * clblast_dequant = MULTILINE_QUOTE(

+typedef uchar uint8_t;
+typedef int int32_t;
+typedef uint uint32_t;
+
+constant uint QK4_0 = 32;
 struct block_q4_0
 {
     float d;
-
+    uint8_t qs[QK4_0 / 2];
 };

-
-
-
-
-
+constant uint QK4_1 = 32;
+struct block_q4_1
+{
+    float d;
+    float m;
+    uint8_t qs[QK4_1 / 2];
+};

-
+constant uint QK5_0 = 32;
+struct __attribute__ ((packed)) block_q5_0
+{
+    half d;
+    uint32_t qh;
+    uint8_t qs[QK5_0 / 2];
+};

-
-
-
-
+constant uint QK5_1 = 32;
+struct block_q5_1
+{
+    half d;
+    half m;
+    uint32_t qh;
+    uint8_t qs[QK5_1 / 2];
+};

-
+constant uint QK8_0 = 32;
+struct block_q8_0
 {
     float d;
-
-    uchar qs[16];
+    uint8_t qs[QK8_0];
 };

-__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);

-
-
+__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
+    constant uint qk = QK4_0;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = x[i].d;

-    const
+    const int x0 = (x[i].qs[j] & 0xf) - 8;
+    const int x1 = (x[i].qs[j] >> 4) - 8;

-
-
-    result[index + 1] = (vi >> 4) * d + m;
+    y[i*qk + j + 0 ] = x0*d;
+    y[i*qk + j + qk/2] = x1*d;
 }

-struct
-
-    ushort d;
-    uchar qs[8];
-};
+__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
+    constant uint qk = QK4_1;

-
-    const uint
-    const uint l = get_local_id(0);
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);

-    const float d =
+    const float d = x[i].d;
+    const float m = x[i].m;

-    const
+    const int x0 = (x[i].qs[j] & 0xf);
+    const int x1 = (x[i].qs[j] >> 4);

-
-
-    result[index + 1] = ((vi >> 4) - 8)*d;
+    y[i*qk + j + 0 ] = x0*d + m;
+    y[i*qk + j + qk/2] = x1*d + m;
 }

+__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
+    constant uint qk = QK5_0;

-
-
-    float d;
-    uint qh;
-    uchar qs[16];
-};
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);

-
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
+    const float d = vload_half(0, (__global half*) &x[i].d);

-
+    uint32_t qh = x[i].qh;

-    const
+    const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;

-    const
+    const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
+    const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;

-
-
-
-    const uint index = i*32 + l2;
-    result[index + 0] = (((vi & 0xf) | vh0) - 16)*d;
-    result[index + 1] = (((vi >> 4) | vh1) - 16)*d;
+    y[i*qk + j + 0 ] = x0*d;
+    y[i*qk + j + qk/2] = x1*d;
 }

-struct block_q5_1
-
-    ushort d;
-    ushort m;
-    uint qh;
-    uchar qs[16];
-};
+__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
+    constant uint qk = QK5_1;

-
-    const uint
-    const uint l = get_local_id(0);
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);

-    const float d = vload_half(0, (__global half*) &
-    const float m = vload_half(0, (__global half*) &
+    const float d = vload_half(0, (__global half*) &x[i].d);
+    const float m = vload_half(0, (__global half*) &x[i].m);

-
+    uint32_t qh = x[i].qh;

-    const
+    const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;

-    const
-    const
+    const int x0 = (x[i].qs[j] & 0xf) | xh_0;
+    const int x1 = (x[i].qs[j] >> 4) | xh_1;

-
-
-    result[index + 1] = ((vi >> 4) | vh1)*d + m;
+    y[i*qk + j + 0 ] = x0*d + m;
+    y[i*qk + j + qk/2] = x1*d + m;
 }

-struct block_q8_0
-
-
-
-};
-
-__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
+__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
+    constant uint qk = QK8_0;
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);

-
+    const float d = x[i].d;
+    y[i*qk + j] = x[i].qs[j]*d;
 }

 );
@@ -148,26 +148,12 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f
 } \
 } while (0)
 
-#define QK5_0 32
-typedef struct {
-    ggml_fp16_t d;          // delta
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2];  // nibbles / quants
-} block_q5_0;
-
-
-typedef struct {
-    float d;                // delta
-    uint32_t qh;            // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2];  // nibbles / quants
-} cl_block_q5_0;
-
 static cl_platform_id platform;
 static cl_device_id device;
 static cl_context context;
 static cl_command_queue queue;
 static cl_program program;
-static cl_kernel kernel_q4_0, kernel_q4_1,
+static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
 static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
 static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
 
@@ -238,8 +224,6 @@ void ggml_cl_init(void) {
     CL_CHECK(err, "clCreateKernel");
     kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
     CL_CHECK(err, "clCreateKernel");
-    kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
-    CL_CHECK(err, "clCreateKernel");
     kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
     CL_CHECK(err, "clCreateKernel");
     kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
@@ -274,7 +258,6 @@ void ggml_cl_sgemm_wrapper(
     cl_kernel kernel;
     size_t global = n * k, local, size_qb;
     bool dequant;
-    cl_block_q5_0* cl_host_b;
 
     switch (btype) {
         case GGML_TYPE_F32:
@@ -292,28 +275,11 @@ void ggml_cl_sgemm_wrapper(
             local = 16;
             size_qb = global * (sizeof(float) * 2 + local) / 32;
             break;
-        case GGML_TYPE_Q4_2:
-            dequant = true;
-            kernel = kernel_q4_2;
-            local = 8;
-            size_qb = global * (sizeof(ggml_fp16_t) + local) / 16;
-            break;
         case GGML_TYPE_Q5_0:
            dequant = true;
            kernel = kernel_q5_0;
            local = 16;
-
-            // 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU...
-            // TODO Find the reason, fix and remove workaround.
-            const block_q5_0* b = (const block_q5_0*) host_b;
-            cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32);
-            for (size_t i = 0; i < global / 32; i++) {
-                cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d);
-                memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t));
-                memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2);
-            }
-            host_b = (const float*) cl_host_b;
-            size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32;
+            size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
            break;
        case GGML_TYPE_Q5_1:
            dequant = true;
@@ -392,7 +358,4 @@ void ggml_cl_sgemm_wrapper(
     clWaitForEvents(1, &ev_c);
     clReleaseEvent(ev_sgemm);
     clReleaseEvent(ev_c);
-    if (btype == GGML_TYPE_Q5_0) {
-        free((void*) cl_host_b);
-    }
 }