llama_cpp 0.14.0 → 0.14.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -53,26 +53,30 @@ extern "C" {
53
53
  //
54
54
  #include <arm_neon.h>
55
55
 
56
+ typedef __fp16 ggml_fp16_internal_t;
57
+
56
58
  #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
57
59
  #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
58
60
 
59
61
  #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
60
62
 
61
63
  static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
62
- __fp16 tmp;
64
+ ggml_fp16_internal_t tmp;
63
65
  memcpy(&tmp, &h, sizeof(ggml_fp16_t));
64
66
  return (float)tmp;
65
67
  }
66
68
 
67
69
  static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
68
70
  ggml_fp16_t res;
69
- __fp16 tmp = f;
71
+ ggml_fp16_internal_t tmp = f;
70
72
  memcpy(&res, &tmp, sizeof(ggml_fp16_t));
71
73
  return res;
72
74
  }
73
75
 
74
76
  #else
75
77
 
78
+ typedef uint16_t ggml_fp16_internal_t;
79
+
76
80
  #ifdef __wasm_simd128__
77
81
  #include <wasm_simd128.h>
78
82
  #else
@@ -1951,6 +1951,11 @@ static struct ggml_backend_i kompute_backend_i = {
1951
1951
  /* .graph_plan_compute = */ NULL,
1952
1952
  /* .graph_compute = */ ggml_backend_kompute_graph_compute,
1953
1953
  /* .supports_op = */ ggml_backend_kompute_supports_op,
1954
+ /* .event_new = */ NULL,
1955
+ /* .event_free = */ NULL,
1956
+ /* .event_record = */ NULL,
1957
+ /* .event_wait = */ NULL,
1958
+ /* .event_synchronize = */ NULL,
1954
1959
  };
1955
1960
 
1956
1961
  static ggml_guid_t ggml_backend_kompute_guid() {
@@ -280,6 +280,11 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
280
280
  id<MTLLibrary> metal_library;
281
281
 
282
282
  // load library
283
+ //
284
+ // - first check if the library is embedded
285
+ // - then check if the library is in the bundle
286
+ // - if not found, load the source and compile it
287
+ // - if that fails, return NULL
283
288
  {
284
289
  NSBundle * bundle = nil;
285
290
  #ifdef SWIFT_PACKAGE
@@ -287,12 +292,21 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
287
292
  #else
288
293
  bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
289
294
  #endif
295
+
290
296
  NSError * error = nil;
291
- NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
292
- if (libPath != nil) {
297
+
298
+ #if GGML_METAL_EMBED_LIBRARY
299
+ const bool try_metallib = false;
300
+ #else
301
+ const bool try_metallib = true;
302
+ #endif
303
+
304
+ NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
305
+ if (try_metallib && path_lib != nil) {
293
306
  // pre-compiled library found
294
- NSURL * libURL = [NSURL fileURLWithPath:libPath];
295
- GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
307
+ NSURL * libURL = [NSURL fileURLWithPath:path_lib];
308
+ GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
309
+
296
310
  metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
297
311
  if (error) {
298
312
  GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
@@ -305,38 +319,41 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
305
319
  extern const char ggml_metallib_start[];
306
320
  extern const char ggml_metallib_end[];
307
321
 
308
- NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
322
+ NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
309
323
  #else
310
324
  GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
311
325
 
312
- NSString * sourcePath;
313
- NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
326
+ NSString * path_source;
327
+ NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
314
328
 
315
- GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, ggmlMetalPathResources ? [ggmlMetalPathResources UTF8String] : "nil");
329
+ GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
316
330
 
317
- if (ggmlMetalPathResources) {
318
- sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"];
331
+ if (path_resource) {
332
+ path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
319
333
  } else {
320
- sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
334
+ path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
321
335
  }
322
- if (sourcePath == nil) {
336
+
337
+ if (path_source == nil) {
323
338
  GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
324
- sourcePath = @"ggml-metal.metal";
339
+ path_source = @"ggml-metal.metal";
325
340
  }
326
- GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
327
- NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
341
+
342
+ GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
343
+
344
+ NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
328
345
  if (error) {
329
346
  GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
330
347
  return NULL;
331
348
  }
332
- #endif
349
+ #endif // GGML_METAL_EMBED_LIBRARY
333
350
 
334
351
  @autoreleasepool {
335
352
  // dictionary of preprocessor macros
336
353
  NSMutableDictionary * prep = [NSMutableDictionary dictionary];
337
354
 
338
355
  #ifdef GGML_QKK_64
339
- prep[@"QK_K"] = @(64);
356
+ prep[@"GGML_QKK_64"] = @(1);
340
357
  #endif
341
358
 
342
359
  MTLCompileOptions* options = [MTLCompileOptions new];
@@ -1642,8 +1659,8 @@ static enum ggml_status ggml_metal_graph_compute(
1642
1659
  // TODO: make this more general
1643
1660
  GGML_ASSERT(n_as <= 8);
1644
1661
 
1645
- // max size of the src1ids array in the kernel stack
1646
- GGML_ASSERT(ne11 <= 512);
1662
+ // max size of the src1ids array in the kernel shared buffer
1663
+ GGML_ASSERT(ne11 <= 4096);
1647
1664
 
1648
1665
  const int64_t ne20 = src2 ? src2->ne[0] : 0;
1649
1666
  const int64_t ne21 = src2 ? src2->ne[1] : 0;
@@ -1741,7 +1758,7 @@ static enum ggml_status ggml_metal_graph_compute(
1741
1758
  [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
1742
1759
  }
1743
1760
 
1744
- [encoder setThreadgroupMemoryLength:8192 atIndex:0];
1761
+ [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + 2*ne11, 16) atIndex:0];
1745
1762
 
1746
1763
  [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
1747
1764
  } else {
@@ -2820,6 +2837,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
2820
2837
  /* .graph_plan_compute = */ NULL,
2821
2838
  /* .graph_compute = */ ggml_backend_metal_graph_compute,
2822
2839
  /* .supports_op = */ ggml_backend_metal_supports_op,
2840
+ /* .event_new = */ NULL,
2841
+ /* .event_free = */ NULL,
2842
+ /* .event_record = */ NULL,
2843
+ /* .event_wait = */ NULL,
2844
+ /* .event_synchronize = */ NULL,
2823
2845
  };
2824
2846
 
2825
2847
  void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {