llama_cpp 0.13.0 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
@@ -53,26 +53,30 @@ extern "C" {
|
|
53
53
|
//
|
54
54
|
#include <arm_neon.h>
|
55
55
|
|
56
|
+
typedef __fp16 ggml_fp16_internal_t;
|
57
|
+
|
56
58
|
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
57
59
|
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
58
60
|
|
59
61
|
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
60
62
|
|
61
63
|
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
62
|
-
|
64
|
+
ggml_fp16_internal_t tmp;
|
63
65
|
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
64
66
|
return (float)tmp;
|
65
67
|
}
|
66
68
|
|
67
69
|
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
68
70
|
ggml_fp16_t res;
|
69
|
-
|
71
|
+
ggml_fp16_internal_t tmp = f;
|
70
72
|
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
71
73
|
return res;
|
72
74
|
}
|
73
75
|
|
74
76
|
#else
|
75
77
|
|
78
|
+
typedef uint16_t ggml_fp16_internal_t;
|
79
|
+
|
76
80
|
#ifdef __wasm_simd128__
|
77
81
|
#include <wasm_simd128.h>
|
78
82
|
#else
|
@@ -1927,10 +1927,10 @@ static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(g
|
|
1927
1927
|
return ggml_backend_kompute_buffer_type(ctx->device);
|
1928
1928
|
}
|
1929
1929
|
|
1930
|
-
static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
1930
|
+
static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
1931
1931
|
auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
|
1932
1932
|
ggml_vk_graph_compute(ctx, cgraph);
|
1933
|
-
return true;
|
1933
|
+
return GGML_STATUS_SUCCESS;
|
1934
1934
|
}
|
1935
1935
|
|
1936
1936
|
static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
@@ -1951,6 +1951,11 @@ static struct ggml_backend_i kompute_backend_i = {
|
|
1951
1951
|
/* .graph_plan_compute = */ NULL,
|
1952
1952
|
/* .graph_compute = */ ggml_backend_kompute_graph_compute,
|
1953
1953
|
/* .supports_op = */ ggml_backend_kompute_supports_op,
|
1954
|
+
/* .event_new = */ NULL,
|
1955
|
+
/* .event_free = */ NULL,
|
1956
|
+
/* .event_record = */ NULL,
|
1957
|
+
/* .event_wait = */ NULL,
|
1958
|
+
/* .event_synchronize = */ NULL,
|
1954
1959
|
};
|
1955
1960
|
|
1956
1961
|
static ggml_guid_t ggml_backend_kompute_guid() {
|
@@ -163,6 +163,8 @@ enum ggml_metal_kernel_type {
|
|
163
163
|
GGML_METAL_KERNEL_TYPE_IM2COL_F32,
|
164
164
|
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
|
165
165
|
GGML_METAL_KERNEL_TYPE_PAD_F32,
|
166
|
+
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
|
167
|
+
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
|
166
168
|
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
|
167
169
|
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,
|
168
170
|
GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,
|
@@ -278,6 +280,11 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
278
280
|
id<MTLLibrary> metal_library;
|
279
281
|
|
280
282
|
// load library
|
283
|
+
//
|
284
|
+
// - first check if the library is embedded
|
285
|
+
// - then check if the library is in the bundle
|
286
|
+
// - if not found, load the source and compile it
|
287
|
+
// - if that fails, return NULL
|
281
288
|
{
|
282
289
|
NSBundle * bundle = nil;
|
283
290
|
#ifdef SWIFT_PACKAGE
|
@@ -285,12 +292,21 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
285
292
|
#else
|
286
293
|
bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
287
294
|
#endif
|
295
|
+
|
288
296
|
NSError * error = nil;
|
289
|
-
|
290
|
-
|
297
|
+
|
298
|
+
#if GGML_METAL_EMBED_LIBRARY
|
299
|
+
const bool try_metallib = false;
|
300
|
+
#else
|
301
|
+
const bool try_metallib = true;
|
302
|
+
#endif
|
303
|
+
|
304
|
+
NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
|
305
|
+
if (try_metallib && path_lib != nil) {
|
291
306
|
// pre-compiled library found
|
292
|
-
NSURL * libURL = [NSURL fileURLWithPath:
|
293
|
-
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [
|
307
|
+
NSURL * libURL = [NSURL fileURLWithPath:path_lib];
|
308
|
+
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
|
309
|
+
|
294
310
|
metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
|
295
311
|
if (error) {
|
296
312
|
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
@@ -303,38 +319,41 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
303
319
|
extern const char ggml_metallib_start[];
|
304
320
|
extern const char ggml_metallib_end[];
|
305
321
|
|
306
|
-
NSString * src
|
322
|
+
NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
|
307
323
|
#else
|
308
324
|
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
309
325
|
|
310
|
-
NSString *
|
311
|
-
NSString *
|
326
|
+
NSString * path_source;
|
327
|
+
NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
|
312
328
|
|
313
|
-
GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__,
|
329
|
+
GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
|
314
330
|
|
315
|
-
if (
|
316
|
-
|
331
|
+
if (path_resource) {
|
332
|
+
path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
|
317
333
|
} else {
|
318
|
-
|
334
|
+
path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
|
319
335
|
}
|
320
|
-
|
336
|
+
|
337
|
+
if (path_source == nil) {
|
321
338
|
GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
|
322
|
-
|
339
|
+
path_source = @"ggml-metal.metal";
|
323
340
|
}
|
324
|
-
|
325
|
-
|
341
|
+
|
342
|
+
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
|
343
|
+
|
344
|
+
NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
|
326
345
|
if (error) {
|
327
346
|
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
328
347
|
return NULL;
|
329
348
|
}
|
330
|
-
#endif
|
349
|
+
#endif // GGML_METAL_EMBED_LIBRARY
|
331
350
|
|
332
351
|
@autoreleasepool {
|
333
352
|
// dictionary of preprocessor macros
|
334
353
|
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
335
354
|
|
336
355
|
#ifdef GGML_QKK_64
|
337
|
-
prep[@"
|
356
|
+
prep[@"GGML_QKK_64"] = @(1);
|
338
357
|
#endif
|
339
358
|
|
340
359
|
MTLCompileOptions* options = [MTLCompileOptions new];
|
@@ -569,6 +588,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
569
588
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
|
570
589
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
|
571
590
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
|
591
|
+
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
|
592
|
+
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
|
572
593
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
|
573
594
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true);
|
574
595
|
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true);
|
@@ -697,6 +718,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
|
697
718
|
return false;
|
698
719
|
case GGML_OP_UPSCALE:
|
699
720
|
case GGML_OP_PAD:
|
721
|
+
case GGML_OP_ARANGE:
|
722
|
+
case GGML_OP_TIMESTEP_EMBEDDING:
|
700
723
|
case GGML_OP_ARGSORT:
|
701
724
|
case GGML_OP_LEAKY_RELU:
|
702
725
|
return true;
|
@@ -742,7 +765,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
|
742
765
|
}
|
743
766
|
}
|
744
767
|
|
745
|
-
static bool ggml_metal_graph_compute(
|
768
|
+
static enum ggml_status ggml_metal_graph_compute(
|
746
769
|
struct ggml_metal_context * ctx,
|
747
770
|
struct ggml_cgraph * gf) {
|
748
771
|
|
@@ -1091,7 +1114,8 @@ static bool ggml_metal_graph_compute(
|
|
1091
1114
|
{
|
1092
1115
|
GGML_ASSERT(ggml_is_contiguous(src0));
|
1093
1116
|
|
1094
|
-
|
1117
|
+
float scale;
|
1118
|
+
memcpy(&scale, dst->op_params, sizeof(scale));
|
1095
1119
|
|
1096
1120
|
int64_t n = ggml_nelements(dst);
|
1097
1121
|
|
@@ -1250,11 +1274,15 @@ static bool ggml_metal_graph_compute(
|
|
1250
1274
|
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
|
1251
1275
|
}
|
1252
1276
|
|
1253
|
-
|
1254
|
-
|
1277
|
+
float scale;
|
1278
|
+
float max_bias;
|
1279
|
+
|
1280
|
+
memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale));
|
1281
|
+
memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
|
1255
1282
|
|
1256
1283
|
const int64_t nrows_x = ggml_nrows(src0);
|
1257
1284
|
const int64_t nrows_y = src0->ne[1];
|
1285
|
+
|
1258
1286
|
const uint32_t n_head_kv = nrows_x/nrows_y;
|
1259
1287
|
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
|
1260
1288
|
|
@@ -1631,8 +1659,8 @@ static bool ggml_metal_graph_compute(
|
|
1631
1659
|
// TODO: make this more general
|
1632
1660
|
GGML_ASSERT(n_as <= 8);
|
1633
1661
|
|
1634
|
-
// max size of the src1ids array in the kernel
|
1635
|
-
GGML_ASSERT(ne11 <=
|
1662
|
+
// max size of the src1ids array in the kernel shared buffer
|
1663
|
+
GGML_ASSERT(ne11 <= 4096);
|
1636
1664
|
|
1637
1665
|
const int64_t ne20 = src2 ? src2->ne[0] : 0;
|
1638
1666
|
const int64_t ne21 = src2 ? src2->ne[1] : 0;
|
@@ -1730,7 +1758,7 @@ static bool ggml_metal_graph_compute(
|
|
1730
1758
|
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
|
1731
1759
|
}
|
1732
1760
|
|
1733
|
-
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
|
1761
|
+
[encoder setThreadgroupMemoryLength:GGML_PAD(8192 + 2*ne11, 16) atIndex:0];
|
1734
1762
|
|
1735
1763
|
[encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
1736
1764
|
} else {
|
@@ -2086,6 +2114,7 @@ static bool ggml_metal_graph_compute(
|
|
2086
2114
|
|
2087
2115
|
//const int n_past = ((int32_t *) dst->op_params)[0];
|
2088
2116
|
const int n_head = ((int32_t *) dst->op_params)[1];
|
2117
|
+
|
2089
2118
|
float max_bias;
|
2090
2119
|
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
|
2091
2120
|
|
@@ -2300,6 +2329,50 @@ static bool ggml_metal_graph_compute(
|
|
2300
2329
|
|
2301
2330
|
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
2302
2331
|
} break;
|
2332
|
+
case GGML_OP_ARANGE:
|
2333
|
+
{
|
2334
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
2335
|
+
|
2336
|
+
float start;
|
2337
|
+
float step;
|
2338
|
+
|
2339
|
+
memcpy(&start, ((int32_t *) dst->op_params) + 0, sizeof(float));
|
2340
|
+
memcpy(&step, ((int32_t *) dst->op_params) + 2, sizeof(float));
|
2341
|
+
|
2342
|
+
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline;
|
2343
|
+
|
2344
|
+
[encoder setComputePipelineState:pipeline];
|
2345
|
+
[encoder setBuffer:id_dst offset:offs_dst atIndex:0];
|
2346
|
+
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:1];
|
2347
|
+
[encoder setBytes:&start length:sizeof(start) atIndex:2];
|
2348
|
+
[encoder setBytes:&step length:sizeof(step) atIndex:3];
|
2349
|
+
|
2350
|
+
const int nth = MIN(1024, ne0);
|
2351
|
+
|
2352
|
+
[encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
2353
|
+
} break;
|
2354
|
+
case GGML_OP_TIMESTEP_EMBEDDING:
|
2355
|
+
{
|
2356
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
2357
|
+
|
2358
|
+
const int dim = dst->op_params[0];
|
2359
|
+
const int max_period = dst->op_params[1];
|
2360
|
+
|
2361
|
+
const int half = dim / 2;
|
2362
|
+
|
2363
|
+
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline;
|
2364
|
+
|
2365
|
+
[encoder setComputePipelineState:pipeline];
|
2366
|
+
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
2367
|
+
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
2368
|
+
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:2];
|
2369
|
+
[encoder setBytes:&dim length:sizeof(dim) atIndex:3];
|
2370
|
+
[encoder setBytes:&max_period length:sizeof(max_period) atIndex:4];
|
2371
|
+
|
2372
|
+
const int nth = MIN(1024, half);
|
2373
|
+
|
2374
|
+
[encoder dispatchThreadgroups:MTLSizeMake(ne00, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
2375
|
+
} break;
|
2303
2376
|
case GGML_OP_ARGSORT:
|
2304
2377
|
{
|
2305
2378
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
@@ -2428,7 +2501,7 @@ static bool ggml_metal_graph_compute(
|
|
2428
2501
|
MTLCommandBufferStatus status = [command_buffer status];
|
2429
2502
|
if (status != MTLCommandBufferStatusCompleted) {
|
2430
2503
|
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
2431
|
-
return false;
|
2504
|
+
return GGML_STATUS_FAILED;
|
2432
2505
|
}
|
2433
2506
|
}
|
2434
2507
|
|
@@ -2437,7 +2510,7 @@ static bool ggml_metal_graph_compute(
|
|
2437
2510
|
}
|
2438
2511
|
|
2439
2512
|
}
|
2440
|
-
return true;
|
2513
|
+
return GGML_STATUS_SUCCESS;
|
2441
2514
|
}
|
2442
2515
|
|
2443
2516
|
////////////////////////////////////////////////////////////////////////////////
|
@@ -2739,7 +2812,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe
|
|
2739
2812
|
UNUSED(backend);
|
2740
2813
|
}
|
2741
2814
|
|
2742
|
-
GGML_CALL static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
2815
|
+
GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
2743
2816
|
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
2744
2817
|
|
2745
2818
|
return ggml_metal_graph_compute(metal_ctx, cgraph);
|
@@ -2764,6 +2837,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
|
|
2764
2837
|
/* .graph_plan_compute = */ NULL,
|
2765
2838
|
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
2766
2839
|
/* .supports_op = */ ggml_backend_metal_supports_op,
|
2840
|
+
/* .event_new = */ NULL,
|
2841
|
+
/* .event_free = */ NULL,
|
2842
|
+
/* .event_record = */ NULL,
|
2843
|
+
/* .event_wait = */ NULL,
|
2844
|
+
/* .event_synchronize = */ NULL,
|
2767
2845
|
};
|
2768
2846
|
|
2769
2847
|
void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
|