llama_cpp 0.13.0 → 0.14.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
@@ -53,26 +53,30 @@ extern "C" {
|
|
53
53
|
//
|
54
54
|
#include <arm_neon.h>
|
55
55
|
|
56
|
+
typedef __fp16 ggml_fp16_internal_t;
|
57
|
+
|
56
58
|
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
57
59
|
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
58
60
|
|
59
61
|
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
60
62
|
|
61
63
|
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
62
|
-
|
64
|
+
ggml_fp16_internal_t tmp;
|
63
65
|
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
64
66
|
return (float)tmp;
|
65
67
|
}
|
66
68
|
|
67
69
|
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
68
70
|
ggml_fp16_t res;
|
69
|
-
|
71
|
+
ggml_fp16_internal_t tmp = f;
|
70
72
|
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
71
73
|
return res;
|
72
74
|
}
|
73
75
|
|
74
76
|
#else
|
75
77
|
|
78
|
+
typedef uint16_t ggml_fp16_internal_t;
|
79
|
+
|
76
80
|
#ifdef __wasm_simd128__
|
77
81
|
#include <wasm_simd128.h>
|
78
82
|
#else
|
@@ -1927,10 +1927,10 @@ static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(g
|
|
1927
1927
|
return ggml_backend_kompute_buffer_type(ctx->device);
|
1928
1928
|
}
|
1929
1929
|
|
1930
|
-
static
|
1930
|
+
static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
1931
1931
|
auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
|
1932
1932
|
ggml_vk_graph_compute(ctx, cgraph);
|
1933
|
-
return
|
1933
|
+
return GGML_STATUS_SUCCESS;
|
1934
1934
|
}
|
1935
1935
|
|
1936
1936
|
static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
@@ -1951,6 +1951,11 @@ static struct ggml_backend_i kompute_backend_i = {
|
|
1951
1951
|
/* .graph_plan_compute = */ NULL,
|
1952
1952
|
/* .graph_compute = */ ggml_backend_kompute_graph_compute,
|
1953
1953
|
/* .supports_op = */ ggml_backend_kompute_supports_op,
|
1954
|
+
/* .event_new = */ NULL,
|
1955
|
+
/* .event_free = */ NULL,
|
1956
|
+
/* .event_record = */ NULL,
|
1957
|
+
/* .event_wait = */ NULL,
|
1958
|
+
/* .event_synchronize = */ NULL,
|
1954
1959
|
};
|
1955
1960
|
|
1956
1961
|
static ggml_guid_t ggml_backend_kompute_guid() {
|
@@ -163,6 +163,8 @@ enum ggml_metal_kernel_type {
|
|
163
163
|
GGML_METAL_KERNEL_TYPE_IM2COL_F32,
|
164
164
|
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
|
165
165
|
GGML_METAL_KERNEL_TYPE_PAD_F32,
|
166
|
+
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
|
167
|
+
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
|
166
168
|
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
|
167
169
|
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,
|
168
170
|
GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,
|
@@ -278,6 +280,11 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
278
280
|
id<MTLLibrary> metal_library;
|
279
281
|
|
280
282
|
// load library
|
283
|
+
//
|
284
|
+
// - first check if the library is embedded
|
285
|
+
// - then check if the library is in the bundle
|
286
|
+
// - if not found, load the source and compile it
|
287
|
+
// - if that fails, return NULL
|
281
288
|
{
|
282
289
|
NSBundle * bundle = nil;
|
283
290
|
#ifdef SWIFT_PACKAGE
|
@@ -285,12 +292,21 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
285
292
|
#else
|
286
293
|
bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
287
294
|
#endif
|
295
|
+
|
288
296
|
NSError * error = nil;
|
289
|
-
|
290
|
-
|
297
|
+
|
298
|
+
#if GGML_METAL_EMBED_LIBRARY
|
299
|
+
const bool try_metallib = false;
|
300
|
+
#else
|
301
|
+
const bool try_metallib = true;
|
302
|
+
#endif
|
303
|
+
|
304
|
+
NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
|
305
|
+
if (try_metallib && path_lib != nil) {
|
291
306
|
// pre-compiled library found
|
292
|
-
NSURL * libURL = [NSURL fileURLWithPath:
|
293
|
-
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [
|
307
|
+
NSURL * libURL = [NSURL fileURLWithPath:path_lib];
|
308
|
+
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
|
309
|
+
|
294
310
|
metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
|
295
311
|
if (error) {
|
296
312
|
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
@@ -303,38 +319,41 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
303
319
|
extern const char ggml_metallib_start[];
|
304
320
|
extern const char ggml_metallib_end[];
|
305
321
|
|
306
|
-
NSString * src
|
322
|
+
NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
|
307
323
|
#else
|
308
324
|
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
309
325
|
|
310
|
-
NSString *
|
311
|
-
NSString *
|
326
|
+
NSString * path_source;
|
327
|
+
NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
|
312
328
|
|
313
|
-
GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__,
|
329
|
+
GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
|
314
330
|
|
315
|
-
if (
|
316
|
-
|
331
|
+
if (path_resource) {
|
332
|
+
path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
|
317
333
|
} else {
|
318
|
-
|
334
|
+
path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
|
319
335
|
}
|
320
|
-
|
336
|
+
|
337
|
+
if (path_source == nil) {
|
321
338
|
GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
|
322
|
-
|
339
|
+
path_source = @"ggml-metal.metal";
|
323
340
|
}
|
324
|
-
|
325
|
-
|
341
|
+
|
342
|
+
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
|
343
|
+
|
344
|
+
NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
|
326
345
|
if (error) {
|
327
346
|
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
328
347
|
return NULL;
|
329
348
|
}
|
330
|
-
#endif
|
349
|
+
#endif // GGML_METAL_EMBED_LIBRARY
|
331
350
|
|
332
351
|
@autoreleasepool {
|
333
352
|
// dictionary of preprocessor macros
|
334
353
|
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
335
354
|
|
336
355
|
#ifdef GGML_QKK_64
|
337
|
-
prep[@"
|
356
|
+
prep[@"GGML_QKK_64"] = @(1);
|
338
357
|
#endif
|
339
358
|
|
340
359
|
MTLCompileOptions* options = [MTLCompileOptions new];
|
@@ -569,6 +588,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
569
588
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
|
570
589
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
|
571
590
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
|
591
|
+
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
|
592
|
+
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
|
572
593
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
|
573
594
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true);
|
574
595
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true);
|
@@ -697,6 +718,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
|
697
718
|
return false;
|
698
719
|
case GGML_OP_UPSCALE:
|
699
720
|
case GGML_OP_PAD:
|
721
|
+
case GGML_OP_ARANGE:
|
722
|
+
case GGML_OP_TIMESTEP_EMBEDDING:
|
700
723
|
case GGML_OP_ARGSORT:
|
701
724
|
case GGML_OP_LEAKY_RELU:
|
702
725
|
return true;
|
@@ -742,7 +765,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
|
742
765
|
}
|
743
766
|
}
|
744
767
|
|
745
|
-
static
|
768
|
+
static enum ggml_status ggml_metal_graph_compute(
|
746
769
|
struct ggml_metal_context * ctx,
|
747
770
|
struct ggml_cgraph * gf) {
|
748
771
|
|
@@ -1091,7 +1114,8 @@ static bool ggml_metal_graph_compute(
|
|
1091
1114
|
{
|
1092
1115
|
GGML_ASSERT(ggml_is_contiguous(src0));
|
1093
1116
|
|
1094
|
-
|
1117
|
+
float scale;
|
1118
|
+
memcpy(&scale, dst->op_params, sizeof(scale));
|
1095
1119
|
|
1096
1120
|
int64_t n = ggml_nelements(dst);
|
1097
1121
|
|
@@ -1250,11 +1274,15 @@ static bool ggml_metal_graph_compute(
|
|
1250
1274
|
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
|
1251
1275
|
}
|
1252
1276
|
|
1253
|
-
|
1254
|
-
|
1277
|
+
float scale;
|
1278
|
+
float max_bias;
|
1279
|
+
|
1280
|
+
memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale));
|
1281
|
+
memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
|
1255
1282
|
|
1256
1283
|
const int64_t nrows_x = ggml_nrows(src0);
|
1257
1284
|
const int64_t nrows_y = src0->ne[1];
|
1285
|
+
|
1258
1286
|
const uint32_t n_head_kv = nrows_x/nrows_y;
|
1259
1287
|
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
|
1260
1288
|
|
@@ -1631,8 +1659,8 @@ static bool ggml_metal_graph_compute(
|
|
1631
1659
|
// TODO: make this more general
|
1632
1660
|
GGML_ASSERT(n_as <= 8);
|
1633
1661
|
|
1634
|
-
// max size of the src1ids array in the kernel
|
1635
|
-
GGML_ASSERT(ne11 <=
|
1662
|
+
// max size of the src1ids array in the kernel shared buffer
|
1663
|
+
GGML_ASSERT(ne11 <= 4096);
|
1636
1664
|
|
1637
1665
|
const int64_t ne20 = src2 ? src2->ne[0] : 0;
|
1638
1666
|
const int64_t ne21 = src2 ? src2->ne[1] : 0;
|
@@ -1730,7 +1758,7 @@ static bool ggml_metal_graph_compute(
|
|
1730
1758
|
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
|
1731
1759
|
}
|
1732
1760
|
|
1733
|
-
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
|
1761
|
+
[encoder setThreadgroupMemoryLength:GGML_PAD(8192 + 2*ne11, 16) atIndex:0];
|
1734
1762
|
|
1735
1763
|
[encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
1736
1764
|
} else {
|
@@ -2086,6 +2114,7 @@ static bool ggml_metal_graph_compute(
|
|
2086
2114
|
|
2087
2115
|
//const int n_past = ((int32_t *) dst->op_params)[0];
|
2088
2116
|
const int n_head = ((int32_t *) dst->op_params)[1];
|
2117
|
+
|
2089
2118
|
float max_bias;
|
2090
2119
|
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
|
2091
2120
|
|
@@ -2300,6 +2329,50 @@ static bool ggml_metal_graph_compute(
|
|
2300
2329
|
|
2301
2330
|
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
2302
2331
|
} break;
|
2332
|
+
case GGML_OP_ARANGE:
|
2333
|
+
{
|
2334
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
2335
|
+
|
2336
|
+
float start;
|
2337
|
+
float step;
|
2338
|
+
|
2339
|
+
memcpy(&start, ((int32_t *) dst->op_params) + 0, sizeof(float));
|
2340
|
+
memcpy(&step, ((int32_t *) dst->op_params) + 2, sizeof(float));
|
2341
|
+
|
2342
|
+
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline;
|
2343
|
+
|
2344
|
+
[encoder setComputePipelineState:pipeline];
|
2345
|
+
[encoder setBuffer:id_dst offset:offs_dst atIndex:0];
|
2346
|
+
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:1];
|
2347
|
+
[encoder setBytes:&start length:sizeof(start) atIndex:2];
|
2348
|
+
[encoder setBytes:&step length:sizeof(step) atIndex:3];
|
2349
|
+
|
2350
|
+
const int nth = MIN(1024, ne0);
|
2351
|
+
|
2352
|
+
[encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
2353
|
+
} break;
|
2354
|
+
case GGML_OP_TIMESTEP_EMBEDDING:
|
2355
|
+
{
|
2356
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
2357
|
+
|
2358
|
+
const int dim = dst->op_params[0];
|
2359
|
+
const int max_period = dst->op_params[1];
|
2360
|
+
|
2361
|
+
const int half = dim / 2;
|
2362
|
+
|
2363
|
+
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline;
|
2364
|
+
|
2365
|
+
[encoder setComputePipelineState:pipeline];
|
2366
|
+
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
2367
|
+
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
2368
|
+
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:2];
|
2369
|
+
[encoder setBytes:&dim length:sizeof(dim) atIndex:3];
|
2370
|
+
[encoder setBytes:&max_period length:sizeof(max_period) atIndex:4];
|
2371
|
+
|
2372
|
+
const int nth = MIN(1024, half);
|
2373
|
+
|
2374
|
+
[encoder dispatchThreadgroups:MTLSizeMake(ne00, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
2375
|
+
} break;
|
2303
2376
|
case GGML_OP_ARGSORT:
|
2304
2377
|
{
|
2305
2378
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
@@ -2428,7 +2501,7 @@ static bool ggml_metal_graph_compute(
|
|
2428
2501
|
MTLCommandBufferStatus status = [command_buffer status];
|
2429
2502
|
if (status != MTLCommandBufferStatusCompleted) {
|
2430
2503
|
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
2431
|
-
return
|
2504
|
+
return GGML_STATUS_FAILED;
|
2432
2505
|
}
|
2433
2506
|
}
|
2434
2507
|
|
@@ -2437,7 +2510,7 @@ static bool ggml_metal_graph_compute(
|
|
2437
2510
|
}
|
2438
2511
|
|
2439
2512
|
}
|
2440
|
-
return
|
2513
|
+
return GGML_STATUS_SUCCESS;
|
2441
2514
|
}
|
2442
2515
|
|
2443
2516
|
////////////////////////////////////////////////////////////////////////////////
|
@@ -2739,7 +2812,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe
|
|
2739
2812
|
UNUSED(backend);
|
2740
2813
|
}
|
2741
2814
|
|
2742
|
-
GGML_CALL static
|
2815
|
+
GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
2743
2816
|
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
2744
2817
|
|
2745
2818
|
return ggml_metal_graph_compute(metal_ctx, cgraph);
|
@@ -2764,6 +2837,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
|
|
2764
2837
|
/* .graph_plan_compute = */ NULL,
|
2765
2838
|
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
2766
2839
|
/* .supports_op = */ ggml_backend_metal_supports_op,
|
2840
|
+
/* .event_new = */ NULL,
|
2841
|
+
/* .event_free = */ NULL,
|
2842
|
+
/* .event_record = */ NULL,
|
2843
|
+
/* .event_wait = */ NULL,
|
2844
|
+
/* .event_synchronize = */ NULL,
|
2767
2845
|
};
|
2768
2846
|
|
2769
2847
|
void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
|