RubyGems - llama_cpp - Versions diffs - 0.13.0 → 0.14.1 - Mend

llama_cpp 0.13.0 → 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +20 -0
data/ext/llama_cpp/llama_cpp.cpp +130 -26
data/lib/llama_cpp/version.rb +2 -2
data/sig/llama_cpp.rbs +15 -4
data/vendor/tmp/llama.cpp/Makefile +30 -15
data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
data/vendor/tmp/llama.cpp/ggml.c +734 -356
data/vendor/tmp/llama.cpp/ggml.h +91 -51
data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
data/vendor/tmp/llama.cpp/llama.h +53 -21
data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
data/vendor/tmp/llama.cpp/unicode.h +16 -774
metadata +4 -2

data/vendor/tmp/llama.cpp/ggml-impl.h CHANGED Viewed

@@ -53,26 +53,30 @@ extern "C" {
 //
 #include <arm_neon.h>
+typedef __fp16 ggml_fp16_internal_t;
 #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    __fp16 tmp;
+    ggml_fp16_internal_t tmp;
     memcpy(&tmp, &h, sizeof(ggml_fp16_t));
     return (float)tmp;
 }
 static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
     ggml_fp16_t res;
-    __fp16 tmp = f;
+    ggml_fp16_internal_t tmp = f;
     memcpy(&res, &tmp, sizeof(ggml_fp16_t));
     return res;
 }
 #else
+typedef uint16_t ggml_fp16_internal_t;
 #ifdef __wasm_simd128__
 #include <wasm_simd128.h>
 #else

data/vendor/tmp/llama.cpp/ggml-kompute.cpp CHANGED Viewed

@@ -1927,10 +1927,10 @@ static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(g
     return ggml_backend_kompute_buffer_type(ctx->device);
 }
-static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
     ggml_vk_graph_compute(ctx, cgraph);
-    return true;
+    return GGML_STATUS_SUCCESS;
 }
 static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -1951,6 +1951,11 @@ static struct ggml_backend_i kompute_backend_i = {
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_kompute_graph_compute,
     /* .supports_op             = */ ggml_backend_kompute_supports_op,
+    /* .event_new               = */ NULL,
+    /* .event_free              = */ NULL,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+    /* .event_synchronize       = */ NULL,
 };
 static ggml_guid_t ggml_backend_kompute_guid() {

data/vendor/tmp/llama.cpp/ggml-metal.m CHANGED Viewed

@@ -163,6 +163,8 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_IM2COL_F32,
     GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
     GGML_METAL_KERNEL_TYPE_PAD_F32,
+    GGML_METAL_KERNEL_TYPE_ARANGE_F32,
+    GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
     GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
     GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,
     GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,
@@ -278,6 +280,11 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
     id<MTLLibrary> metal_library;
     // load library
+    //
+    // - first check if the library is embedded
+    // - then check if the library is in the bundle
+    // - if not found, load the source and compile it
+    // - if that fails, return NULL
     {
         NSBundle * bundle = nil;
 #ifdef SWIFT_PACKAGE
@@ -285,12 +292,21 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #else
         bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
 #endif
         NSError * error = nil;
-        NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
-        if (libPath != nil) {
+#if GGML_METAL_EMBED_LIBRARY
+        const bool try_metallib = false;
+#else
+        const bool try_metallib = true;
+#endif
+        NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
+        if (try_metallib && path_lib != nil) {
             // pre-compiled library found
-            NSURL * libURL = [NSURL fileURLWithPath:libPath];
-            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
+            NSURL * libURL = [NSURL fileURLWithPath:path_lib];
+            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
             metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
             if (error) {
                 GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
@@ -303,38 +319,41 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
             extern const char ggml_metallib_start[];
             extern const char ggml_metallib_end[];
-            NSString * src  = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
+            NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
 #else
             GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
-            NSString * sourcePath;
-            NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
+            NSString * path_source;
+            NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
-            GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, ggmlMetalPathResources ? [ggmlMetalPathResources UTF8String] : "nil");
+            GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
-            if (ggmlMetalPathResources) {
-                sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"];
+            if (path_resource) {
+                path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
             } else {
-                sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
+                path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
             }
-            if (sourcePath == nil) {
+            if (path_source == nil) {
                 GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
-                sourcePath = @"ggml-metal.metal";
+                path_source = @"ggml-metal.metal";
             }
-            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
-            NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
+            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
+            NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
             if (error) {
                 GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
                 return NULL;
             }
-#endif
+#endif // GGML_METAL_EMBED_LIBRARY
             @autoreleasepool {
                 // dictionary of preprocessor macros
                 NSMutableDictionary * prep = [NSMutableDictionary dictionary];
 #ifdef GGML_QKK_64
-                prep[@"QK_K"] = @(64);
+                prep[@"GGML_QKK_64"] = @(1);
 #endif
                 MTLCompileOptions* options = [MTLCompileOptions new];
@@ -569,6 +588,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32,                im2col_f32,             true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32,               upscale_f32,            true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32,                   pad_f32,                true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,    timestep_embedding_f32, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32,                arange_f32,             true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,       argsort_f32_i32_asc,    true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,      argsort_f32_i32_desc,   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,            leaky_relu_f32,         true);
@@ -697,6 +718,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
             return false;
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
+        case GGML_OP_ARANGE:
+        case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_ARGSORT:
         case GGML_OP_LEAKY_RELU:
             return true;
@@ -742,7 +765,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
     }
 }
-static bool ggml_metal_graph_compute(
+static enum ggml_status ggml_metal_graph_compute(
         struct ggml_metal_context * ctx,
                struct ggml_cgraph * gf) {
@@ -1091,7 +1114,8 @@ static bool ggml_metal_graph_compute(
                     {
                         GGML_ASSERT(ggml_is_contiguous(src0));
-                        const float scale = *(const float *) dst->op_params;
+                        float scale;
+                        memcpy(&scale, dst->op_params, sizeof(scale));
                         int64_t n = ggml_nelements(dst);
@@ -1250,11 +1274,15 @@ static bool ggml_metal_graph_compute(
                             pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
                         }
-                        const float scale    = ((float *) dst->op_params)[0];
-                        const float max_bias = ((float *) dst->op_params)[1];
+                        float scale;
+                        float max_bias;
+                        memcpy(&scale,    ((int32_t *) dst->op_params) + 0, sizeof(scale));
+                        memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
                         const int64_t nrows_x = ggml_nrows(src0);
                         const int64_t nrows_y = src0->ne[1];
                         const uint32_t n_head_kv   = nrows_x/nrows_y;
                         const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
@@ -1631,8 +1659,8 @@ static bool ggml_metal_graph_compute(
                         // TODO: make this more general
                         GGML_ASSERT(n_as <= 8);
-                        // max size of the src1ids array in the kernel stack
-                        GGML_ASSERT(ne11 <= 512);
+                        // max size of the src1ids array in the kernel shared buffer
+                        GGML_ASSERT(ne11 <= 4096);
                         const int64_t  ne20 = src2 ? src2->ne[0] : 0;
                         const int64_t  ne21 = src2 ? src2->ne[1] : 0;
@@ -1730,7 +1758,7 @@ static bool ggml_metal_graph_compute(
                                 [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
                             }
-                            [encoder setThreadgroupMemoryLength:8192 atIndex:0];
+                            [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + 2*ne11, 16) atIndex:0];
                             [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
                         } else {
@@ -2086,6 +2114,7 @@ static bool ggml_metal_graph_compute(
                         //const int n_past = ((int32_t *) dst->op_params)[0];
                         const int n_head = ((int32_t *) dst->op_params)[1];
                         float max_bias;
                         memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
@@ -2300,6 +2329,50 @@ static bool ggml_metal_graph_compute(
                         [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                     } break;
+                case GGML_OP_ARANGE:
+                    {
+                        GGML_ASSERT(dst->type == GGML_TYPE_F32);
+                        float start;
+                        float step;
+                        memcpy(&start, ((int32_t *) dst->op_params) + 0, sizeof(float));
+                        memcpy(&step,  ((int32_t *) dst->op_params) + 2, sizeof(float));
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline;
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_dst  offset:offs_dst    atIndex:0];
+                        [encoder setBytes:&ne0   length:sizeof(ne0)   atIndex:1];
+                        [encoder setBytes:&start length:sizeof(start) atIndex:2];
+                        [encoder setBytes:&step  length:sizeof(step)  atIndex:3];
+                        const int nth = MIN(1024, ne0);
+                        [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
+                case GGML_OP_TIMESTEP_EMBEDDING:
+                    {
+                        GGML_ASSERT(src0->type == GGML_TYPE_F32);
+                        const int dim        = dst->op_params[0];
+                        const int max_period = dst->op_params[1];
+                        const int half = dim / 2;
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline;
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                        [encoder setBytes:&nb1   length:sizeof(nb1) atIndex:2];
+                        [encoder setBytes:&dim   length:sizeof(dim) atIndex:3];
+                        [encoder setBytes:&max_period length:sizeof(max_period) atIndex:4];
+                        const int nth = MIN(1024, half);
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne00, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
                 case GGML_OP_ARGSORT:
                     {
                         GGML_ASSERT(src0->type == GGML_TYPE_F32);
@@ -2428,7 +2501,7 @@ static bool ggml_metal_graph_compute(
         MTLCommandBufferStatus status = [command_buffer status];
         if (status != MTLCommandBufferStatusCompleted) {
             GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
-            return false;
+            return GGML_STATUS_FAILED;
         }
     }
@@ -2437,7 +2510,7 @@ static bool ggml_metal_graph_compute(
     }
     }
-    return true;
+    return GGML_STATUS_SUCCESS;
 }
 ////////////////////////////////////////////////////////////////////////////////
@@ -2739,7 +2812,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe
     UNUSED(backend);
 }
-GGML_CALL static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
     return ggml_metal_graph_compute(metal_ctx, cgraph);
@@ -2764,6 +2837,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_metal_graph_compute,
     /* .supports_op             = */ ggml_backend_metal_supports_op,
+    /* .event_new               = */ NULL,
+    /* .event_free              = */ NULL,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+    /* .event_synchronize       = */ NULL,
 };
 void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {