llama_cpp 0.14.0 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +71 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +9 -0
- data/vendor/tmp/llama.cpp/Makefile +28 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +14 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +358 -135
- data/vendor/tmp/llama.cpp/ggml-backend.h +41 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +187 -1033
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +42 -20
- data/vendor/tmp/llama.cpp/ggml-metal.metal +44 -910
- data/vendor/tmp/llama.cpp/ggml-quants.c +457 -1074
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +388 -565
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +6 -39
- data/vendor/tmp/llama.cpp/ggml.c +509 -343
- data/vendor/tmp/llama.cpp/ggml.h +61 -47
- data/vendor/tmp/llama.cpp/llama.cpp +1446 -687
- data/vendor/tmp/llama.cpp/llama.h +25 -11
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
@@ -53,26 +53,30 @@ extern "C" {
|
|
53
53
|
//
|
54
54
|
#include <arm_neon.h>
|
55
55
|
|
56
|
+
typedef __fp16 ggml_fp16_internal_t;
|
57
|
+
|
56
58
|
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
57
59
|
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
58
60
|
|
59
61
|
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
60
62
|
|
61
63
|
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
62
|
-
|
64
|
+
ggml_fp16_internal_t tmp;
|
63
65
|
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
64
66
|
return (float)tmp;
|
65
67
|
}
|
66
68
|
|
67
69
|
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
68
70
|
ggml_fp16_t res;
|
69
|
-
|
71
|
+
ggml_fp16_internal_t tmp = f;
|
70
72
|
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
71
73
|
return res;
|
72
74
|
}
|
73
75
|
|
74
76
|
#else
|
75
77
|
|
78
|
+
typedef uint16_t ggml_fp16_internal_t;
|
79
|
+
|
76
80
|
#ifdef __wasm_simd128__
|
77
81
|
#include <wasm_simd128.h>
|
78
82
|
#else
|
@@ -1951,6 +1951,11 @@ static struct ggml_backend_i kompute_backend_i = {
|
|
1951
1951
|
/* .graph_plan_compute = */ NULL,
|
1952
1952
|
/* .graph_compute = */ ggml_backend_kompute_graph_compute,
|
1953
1953
|
/* .supports_op = */ ggml_backend_kompute_supports_op,
|
1954
|
+
/* .event_new = */ NULL,
|
1955
|
+
/* .event_free = */ NULL,
|
1956
|
+
/* .event_record = */ NULL,
|
1957
|
+
/* .event_wait = */ NULL,
|
1958
|
+
/* .event_synchronize = */ NULL,
|
1954
1959
|
};
|
1955
1960
|
|
1956
1961
|
static ggml_guid_t ggml_backend_kompute_guid() {
|
@@ -280,6 +280,11 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
280
280
|
id<MTLLibrary> metal_library;
|
281
281
|
|
282
282
|
// load library
|
283
|
+
//
|
284
|
+
// - first check if the library is embedded
|
285
|
+
// - then check if the library is in the bundle
|
286
|
+
// - if not found, load the source and compile it
|
287
|
+
// - if that fails, return NULL
|
283
288
|
{
|
284
289
|
NSBundle * bundle = nil;
|
285
290
|
#ifdef SWIFT_PACKAGE
|
@@ -287,12 +292,21 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
287
292
|
#else
|
288
293
|
bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
289
294
|
#endif
|
295
|
+
|
290
296
|
NSError * error = nil;
|
291
|
-
|
292
|
-
|
297
|
+
|
298
|
+
#if GGML_METAL_EMBED_LIBRARY
|
299
|
+
const bool try_metallib = false;
|
300
|
+
#else
|
301
|
+
const bool try_metallib = true;
|
302
|
+
#endif
|
303
|
+
|
304
|
+
NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
|
305
|
+
if (try_metallib && path_lib != nil) {
|
293
306
|
// pre-compiled library found
|
294
|
-
NSURL * libURL = [NSURL fileURLWithPath:
|
295
|
-
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [
|
307
|
+
NSURL * libURL = [NSURL fileURLWithPath:path_lib];
|
308
|
+
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
|
309
|
+
|
296
310
|
metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
|
297
311
|
if (error) {
|
298
312
|
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
@@ -305,38 +319,41 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
305
319
|
extern const char ggml_metallib_start[];
|
306
320
|
extern const char ggml_metallib_end[];
|
307
321
|
|
308
|
-
NSString * src
|
322
|
+
NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
|
309
323
|
#else
|
310
324
|
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
311
325
|
|
312
|
-
NSString *
|
313
|
-
NSString *
|
326
|
+
NSString * path_source;
|
327
|
+
NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
|
314
328
|
|
315
|
-
GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__,
|
329
|
+
GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
|
316
330
|
|
317
|
-
if (
|
318
|
-
|
331
|
+
if (path_resource) {
|
332
|
+
path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
|
319
333
|
} else {
|
320
|
-
|
334
|
+
path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
|
321
335
|
}
|
322
|
-
|
336
|
+
|
337
|
+
if (path_source == nil) {
|
323
338
|
GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
|
324
|
-
|
339
|
+
path_source = @"ggml-metal.metal";
|
325
340
|
}
|
326
|
-
|
327
|
-
|
341
|
+
|
342
|
+
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
|
343
|
+
|
344
|
+
NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
|
328
345
|
if (error) {
|
329
346
|
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
330
347
|
return NULL;
|
331
348
|
}
|
332
|
-
#endif
|
349
|
+
#endif // GGML_METAL_EMBED_LIBRARY
|
333
350
|
|
334
351
|
@autoreleasepool {
|
335
352
|
// dictionary of preprocessor macros
|
336
353
|
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
337
354
|
|
338
355
|
#ifdef GGML_QKK_64
|
339
|
-
prep[@"
|
356
|
+
prep[@"GGML_QKK_64"] = @(1);
|
340
357
|
#endif
|
341
358
|
|
342
359
|
MTLCompileOptions* options = [MTLCompileOptions new];
|
@@ -1642,8 +1659,8 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
1642
1659
|
// TODO: make this more general
|
1643
1660
|
GGML_ASSERT(n_as <= 8);
|
1644
1661
|
|
1645
|
-
// max size of the src1ids array in the kernel
|
1646
|
-
GGML_ASSERT(ne11 <=
|
1662
|
+
// max size of the src1ids array in the kernel shared buffer
|
1663
|
+
GGML_ASSERT(ne11 <= 4096);
|
1647
1664
|
|
1648
1665
|
const int64_t ne20 = src2 ? src2->ne[0] : 0;
|
1649
1666
|
const int64_t ne21 = src2 ? src2->ne[1] : 0;
|
@@ -1741,7 +1758,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
|
1741
1758
|
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
|
1742
1759
|
}
|
1743
1760
|
|
1744
|
-
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
|
1761
|
+
[encoder setThreadgroupMemoryLength:GGML_PAD(8192 + 2*ne11, 16) atIndex:0];
|
1745
1762
|
|
1746
1763
|
[encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
1747
1764
|
} else {
|
@@ -2820,6 +2837,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
|
|
2820
2837
|
/* .graph_plan_compute = */ NULL,
|
2821
2838
|
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
2822
2839
|
/* .supports_op = */ ggml_backend_metal_supports_op,
|
2840
|
+
/* .event_new = */ NULL,
|
2841
|
+
/* .event_free = */ NULL,
|
2842
|
+
/* .event_record = */ NULL,
|
2843
|
+
/* .event_wait = */ NULL,
|
2844
|
+
/* .event_synchronize = */ NULL,
|
2823
2845
|
};
|
2824
2846
|
|
2825
2847
|
void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
|