llama_cpp 0.13.0 → 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -53,26 +53,30 @@ extern "C" {
53
53
  //
54
54
  #include <arm_neon.h>
55
55
 
56
+ typedef __fp16 ggml_fp16_internal_t;
57
+
56
58
  #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
57
59
  #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
58
60
 
59
61
  #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
60
62
 
61
63
  static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
62
- __fp16 tmp;
64
+ ggml_fp16_internal_t tmp;
63
65
  memcpy(&tmp, &h, sizeof(ggml_fp16_t));
64
66
  return (float)tmp;
65
67
  }
66
68
 
67
69
  static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
68
70
  ggml_fp16_t res;
69
- __fp16 tmp = f;
71
+ ggml_fp16_internal_t tmp = f;
70
72
  memcpy(&res, &tmp, sizeof(ggml_fp16_t));
71
73
  return res;
72
74
  }
73
75
 
74
76
  #else
75
77
 
78
+ typedef uint16_t ggml_fp16_internal_t;
79
+
76
80
  #ifdef __wasm_simd128__
77
81
  #include <wasm_simd128.h>
78
82
  #else
@@ -1927,10 +1927,10 @@ static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(g
1927
1927
  return ggml_backend_kompute_buffer_type(ctx->device);
1928
1928
  }
1929
1929
 
1930
- static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
1930
+ static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
1931
1931
  auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
1932
1932
  ggml_vk_graph_compute(ctx, cgraph);
1933
- return true;
1933
+ return GGML_STATUS_SUCCESS;
1934
1934
  }
1935
1935
 
1936
1936
  static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -1951,6 +1951,11 @@ static struct ggml_backend_i kompute_backend_i = {
1951
1951
  /* .graph_plan_compute = */ NULL,
1952
1952
  /* .graph_compute = */ ggml_backend_kompute_graph_compute,
1953
1953
  /* .supports_op = */ ggml_backend_kompute_supports_op,
1954
+ /* .event_new = */ NULL,
1955
+ /* .event_free = */ NULL,
1956
+ /* .event_record = */ NULL,
1957
+ /* .event_wait = */ NULL,
1958
+ /* .event_synchronize = */ NULL,
1954
1959
  };
1955
1960
 
1956
1961
  static ggml_guid_t ggml_backend_kompute_guid() {
@@ -163,6 +163,8 @@ enum ggml_metal_kernel_type {
163
163
  GGML_METAL_KERNEL_TYPE_IM2COL_F32,
164
164
  GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
165
165
  GGML_METAL_KERNEL_TYPE_PAD_F32,
166
+ GGML_METAL_KERNEL_TYPE_ARANGE_F32,
167
+ GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
166
168
  GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
167
169
  GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,
168
170
  GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,
@@ -278,6 +280,11 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
278
280
  id<MTLLibrary> metal_library;
279
281
 
280
282
  // load library
283
+ //
284
+ // - first check if the library is embedded
285
+ // - then check if the library is in the bundle
286
+ // - if not found, load the source and compile it
287
+ // - if that fails, return NULL
281
288
  {
282
289
  NSBundle * bundle = nil;
283
290
  #ifdef SWIFT_PACKAGE
@@ -285,12 +292,21 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
285
292
  #else
286
293
  bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
287
294
  #endif
295
+
288
296
  NSError * error = nil;
289
- NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
290
- if (libPath != nil) {
297
+
298
+ #if GGML_METAL_EMBED_LIBRARY
299
+ const bool try_metallib = false;
300
+ #else
301
+ const bool try_metallib = true;
302
+ #endif
303
+
304
+ NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
305
+ if (try_metallib && path_lib != nil) {
291
306
  // pre-compiled library found
292
- NSURL * libURL = [NSURL fileURLWithPath:libPath];
293
- GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
307
+ NSURL * libURL = [NSURL fileURLWithPath:path_lib];
308
+ GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
309
+
294
310
  metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
295
311
  if (error) {
296
312
  GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
@@ -303,38 +319,41 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
303
319
  extern const char ggml_metallib_start[];
304
320
  extern const char ggml_metallib_end[];
305
321
 
306
- NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
322
+ NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
307
323
  #else
308
324
  GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
309
325
 
310
- NSString * sourcePath;
311
- NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
326
+ NSString * path_source;
327
+ NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
312
328
 
313
- GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, ggmlMetalPathResources ? [ggmlMetalPathResources UTF8String] : "nil");
329
+ GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
314
330
 
315
- if (ggmlMetalPathResources) {
316
- sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"];
331
+ if (path_resource) {
332
+ path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
317
333
  } else {
318
- sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
334
+ path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
319
335
  }
320
- if (sourcePath == nil) {
336
+
337
+ if (path_source == nil) {
321
338
  GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
322
- sourcePath = @"ggml-metal.metal";
339
+ path_source = @"ggml-metal.metal";
323
340
  }
324
- GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
325
- NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
341
+
342
+ GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
343
+
344
+ NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
326
345
  if (error) {
327
346
  GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
328
347
  return NULL;
329
348
  }
330
- #endif
349
+ #endif // GGML_METAL_EMBED_LIBRARY
331
350
 
332
351
  @autoreleasepool {
333
352
  // dictionary of preprocessor macros
334
353
  NSMutableDictionary * prep = [NSMutableDictionary dictionary];
335
354
 
336
355
  #ifdef GGML_QKK_64
337
- prep[@"QK_K"] = @(64);
356
+ prep[@"GGML_QKK_64"] = @(1);
338
357
  #endif
339
358
 
340
359
  MTLCompileOptions* options = [MTLCompileOptions new];
@@ -569,6 +588,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
569
588
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
570
589
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
571
590
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
591
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
592
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
572
593
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
573
594
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true);
574
595
  GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true);
@@ -697,6 +718,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
697
718
  return false;
698
719
  case GGML_OP_UPSCALE:
699
720
  case GGML_OP_PAD:
721
+ case GGML_OP_ARANGE:
722
+ case GGML_OP_TIMESTEP_EMBEDDING:
700
723
  case GGML_OP_ARGSORT:
701
724
  case GGML_OP_LEAKY_RELU:
702
725
  return true;
@@ -742,7 +765,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
742
765
  }
743
766
  }
744
767
 
745
- static bool ggml_metal_graph_compute(
768
+ static enum ggml_status ggml_metal_graph_compute(
746
769
  struct ggml_metal_context * ctx,
747
770
  struct ggml_cgraph * gf) {
748
771
 
@@ -1091,7 +1114,8 @@ static bool ggml_metal_graph_compute(
1091
1114
  {
1092
1115
  GGML_ASSERT(ggml_is_contiguous(src0));
1093
1116
 
1094
- const float scale = *(const float *) dst->op_params;
1117
+ float scale;
1118
+ memcpy(&scale, dst->op_params, sizeof(scale));
1095
1119
 
1096
1120
  int64_t n = ggml_nelements(dst);
1097
1121
 
@@ -1250,11 +1274,15 @@ static bool ggml_metal_graph_compute(
1250
1274
  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
1251
1275
  }
1252
1276
 
1253
- const float scale = ((float *) dst->op_params)[0];
1254
- const float max_bias = ((float *) dst->op_params)[1];
1277
+ float scale;
1278
+ float max_bias;
1279
+
1280
+ memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale));
1281
+ memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
1255
1282
 
1256
1283
  const int64_t nrows_x = ggml_nrows(src0);
1257
1284
  const int64_t nrows_y = src0->ne[1];
1285
+
1258
1286
  const uint32_t n_head_kv = nrows_x/nrows_y;
1259
1287
  const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
1260
1288
 
@@ -1631,8 +1659,8 @@ static bool ggml_metal_graph_compute(
1631
1659
  // TODO: make this more general
1632
1660
  GGML_ASSERT(n_as <= 8);
1633
1661
 
1634
- // max size of the src1ids array in the kernel stack
1635
- GGML_ASSERT(ne11 <= 512);
1662
+ // max size of the src1ids array in the kernel shared buffer
1663
+ GGML_ASSERT(ne11 <= 4096);
1636
1664
 
1637
1665
  const int64_t ne20 = src2 ? src2->ne[0] : 0;
1638
1666
  const int64_t ne21 = src2 ? src2->ne[1] : 0;
@@ -1730,7 +1758,7 @@ static bool ggml_metal_graph_compute(
1730
1758
  [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
1731
1759
  }
1732
1760
 
1733
- [encoder setThreadgroupMemoryLength:8192 atIndex:0];
1761
+ [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + 2*ne11, 16) atIndex:0];
1734
1762
 
1735
1763
  [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
1736
1764
  } else {
@@ -2086,6 +2114,7 @@ static bool ggml_metal_graph_compute(
2086
2114
 
2087
2115
  //const int n_past = ((int32_t *) dst->op_params)[0];
2088
2116
  const int n_head = ((int32_t *) dst->op_params)[1];
2117
+
2089
2118
  float max_bias;
2090
2119
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
2091
2120
 
@@ -2300,6 +2329,50 @@ static bool ggml_metal_graph_compute(
2300
2329
 
2301
2330
  [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
2302
2331
  } break;
2332
+ case GGML_OP_ARANGE:
2333
+ {
2334
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
2335
+
2336
+ float start;
2337
+ float step;
2338
+
2339
+ memcpy(&start, ((int32_t *) dst->op_params) + 0, sizeof(float));
2340
+ memcpy(&step, ((int32_t *) dst->op_params) + 2, sizeof(float));
2341
+
2342
+ id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline;
2343
+
2344
+ [encoder setComputePipelineState:pipeline];
2345
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:0];
2346
+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:1];
2347
+ [encoder setBytes:&start length:sizeof(start) atIndex:2];
2348
+ [encoder setBytes:&step length:sizeof(step) atIndex:3];
2349
+
2350
+ const int nth = MIN(1024, ne0);
2351
+
2352
+ [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
2353
+ } break;
2354
+ case GGML_OP_TIMESTEP_EMBEDDING:
2355
+ {
2356
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
2357
+
2358
+ const int dim = dst->op_params[0];
2359
+ const int max_period = dst->op_params[1];
2360
+
2361
+ const int half = dim / 2;
2362
+
2363
+ id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline;
2364
+
2365
+ [encoder setComputePipelineState:pipeline];
2366
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
2367
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
2368
+ [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:2];
2369
+ [encoder setBytes:&dim length:sizeof(dim) atIndex:3];
2370
+ [encoder setBytes:&max_period length:sizeof(max_period) atIndex:4];
2371
+
2372
+ const int nth = MIN(1024, half);
2373
+
2374
+ [encoder dispatchThreadgroups:MTLSizeMake(ne00, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
2375
+ } break;
2303
2376
  case GGML_OP_ARGSORT:
2304
2377
  {
2305
2378
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
@@ -2428,7 +2501,7 @@ static bool ggml_metal_graph_compute(
2428
2501
  MTLCommandBufferStatus status = [command_buffer status];
2429
2502
  if (status != MTLCommandBufferStatusCompleted) {
2430
2503
  GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
2431
- return false;
2504
+ return GGML_STATUS_FAILED;
2432
2505
  }
2433
2506
  }
2434
2507
 
@@ -2437,7 +2510,7 @@ static bool ggml_metal_graph_compute(
2437
2510
  }
2438
2511
 
2439
2512
  }
2440
- return true;
2513
+ return GGML_STATUS_SUCCESS;
2441
2514
  }
2442
2515
 
2443
2516
  ////////////////////////////////////////////////////////////////////////////////
@@ -2739,7 +2812,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe
2739
2812
  UNUSED(backend);
2740
2813
  }
2741
2814
 
2742
- GGML_CALL static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
2815
+ GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
2743
2816
  struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
2744
2817
 
2745
2818
  return ggml_metal_graph_compute(metal_ctx, cgraph);
@@ -2764,6 +2837,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
2764
2837
  /* .graph_plan_compute = */ NULL,
2765
2838
  /* .graph_compute = */ ggml_backend_metal_graph_compute,
2766
2839
  /* .supports_op = */ ggml_backend_metal_supports_op,
2840
+ /* .event_new = */ NULL,
2841
+ /* .event_free = */ NULL,
2842
+ /* .event_record = */ NULL,
2843
+ /* .event_wait = */ NULL,
2844
+ /* .event_synchronize = */ NULL,
2767
2845
  };
2768
2846
 
2769
2847
  void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {