llama_cpp 0.13.0 → 0.14.1
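
Excerpt of the diff of the vendored llama.cpp/ggml sources between the two gem versions; from their content, the hunks below appear to come from ggml-impl.h, ggml-kompute.cpp, and ggml-metal.m.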

@@ -53,26 +53,30 @@ extern "C" {
 //
 #include <arm_neon.h>
 
+typedef __fp16 ggml_fp16_internal_t;
+
 #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 
 #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 
 static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    __fp16 tmp;
+    ggml_fp16_internal_t tmp;
     memcpy(&tmp, &h, sizeof(ggml_fp16_t));
     return (float)tmp;
 }
 
 static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
     ggml_fp16_t res;
-    __fp16 tmp = f;
+    ggml_fp16_internal_t tmp = f;
     memcpy(&res, &tmp, sizeof(ggml_fp16_t));
     return res;
 }
 
 #else
 
+typedef uint16_t ggml_fp16_internal_t;
+
 #ifdef __wasm_simd128__
 #include <wasm_simd128.h>
 #else
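
Aside (not part of the diff): the new ggml_fp16_internal_t typedef keeps the scalar FP16 conversion helpers above identical across builds; only the internal type changes per platform. Below is a standalone C sketch of the same pattern, using illustrative names (fp16_internal_t, fp16_to_fp32) rather than the real ggml ones, and a simplified software fallback that handles normal numbers only.

// Standalone sketch, not ggml code: per-platform internal fp16 type plus
// memcpy-based reinterpretation, mirroring the pattern in the hunk above.
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#if defined(__ARM_NEON) && !defined(_MSC_VER)
typedef __fp16   fp16_internal_t;   // hardware half type on NEON
#else
typedef uint16_t fp16_internal_t;   // raw bits elsewhere
#endif

typedef uint16_t fp16_storage_t;    // what actually gets stored in tensors

static inline float fp16_to_fp32(fp16_storage_t h) {
#if defined(__ARM_NEON) && !defined(_MSC_VER)
    fp16_internal_t tmp;
    memcpy(&tmp, &h, sizeof(h));    // reinterpret the stored bits
    return (float) tmp;             // the cast converts half -> float in hardware
#else
    // Simplified software fallback: normal numbers only.
    const uint32_t sign = (uint32_t)(h >> 15) << 31;
    const uint32_t exp  = (uint32_t)((h >> 10) & 0x1f);
    const uint32_t mant = (uint32_t)(h & 0x3ff);
    const uint32_t bits = sign | ((exp + 112) << 23) | (mant << 13);
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
#endif
}

int main(void) {
    printf("%f\n", fp16_to_fp32(0x3C00)); // 0x3C00 is 1.0 in binary16
    return 0;
}
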
@@ -1927,10 +1927,10 @@ static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(g
     return ggml_backend_kompute_buffer_type(ctx->device);
 }
 
-static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
     ggml_vk_graph_compute(ctx, cgraph);
-    return true;
+    return GGML_STATUS_SUCCESS;
 }
 
 static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -1951,6 +1951,11 @@ static struct ggml_backend_i kompute_backend_i = {
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_kompute_graph_compute,
     /* .supports_op = */ ggml_backend_kompute_supports_op,
+    /* .event_new = */ NULL,
+    /* .event_free = */ NULL,
+    /* .event_record = */ NULL,
+    /* .event_wait = */ NULL,
+    /* .event_synchronize = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_kompute_guid() {
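
Aside (not part of the diff): the Kompute backend's graph_compute hook now reports enum ggml_status instead of bool, and the backend vtable gains the event_* slots (all NULL here). A minimal sketch of how calling code might check the new status, assuming the public ggml_backend_graph_compute() declared in ggml-backend.h of this release:

// Sketch only: error handling around the new ggml_status return value.
#include <stdbool.h>
#include <stdio.h>
#include "ggml-backend.h"

// Returns true on success; otherwise logs the numeric ggml_status code.
static bool compute_or_log(ggml_backend_t backend, struct ggml_cgraph * graph) {
    enum ggml_status st = ggml_backend_graph_compute(backend, graph);
    if (st != GGML_STATUS_SUCCESS) {
        fprintf(stderr, "graph compute failed with status %d\n", (int) st);
        return false;
    }
    return true;
}
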
@@ -163,6 +163,8 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_IM2COL_F32,
     GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
     GGML_METAL_KERNEL_TYPE_PAD_F32,
+    GGML_METAL_KERNEL_TYPE_ARANGE_F32,
+    GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
    GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
    GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,
    GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,
@@ -278,6 +280,11 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
     id<MTLLibrary> metal_library;
 
     // load library
+    //
+    // - first check if the library is embedded
+    // - then check if the library is in the bundle
+    // - if not found, load the source and compile it
+    // - if that fails, return NULL
     {
         NSBundle * bundle = nil;
 #ifdef SWIFT_PACKAGE
@@ -285,12 +292,21 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #else
         bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
 #endif
+
         NSError * error = nil;
-        NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
-        if (libPath != nil) {
+
+#if GGML_METAL_EMBED_LIBRARY
+        const bool try_metallib = false;
+#else
+        const bool try_metallib = true;
+#endif
+
+        NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
+        if (try_metallib && path_lib != nil) {
             // pre-compiled library found
-            NSURL * libURL = [NSURL fileURLWithPath:libPath];
-            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
+            NSURL * libURL = [NSURL fileURLWithPath:path_lib];
+            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
+
             metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
             if (error) {
                 GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
@@ -303,38 +319,41 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         extern const char ggml_metallib_start[];
         extern const char ggml_metallib_end[];
 
-        NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
+        NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
 #else
         GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
 
-        NSString * sourcePath;
-        NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
+        NSString * path_source;
+        NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
 
-        GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, ggmlMetalPathResources ? [ggmlMetalPathResources UTF8String] : "nil");
+        GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
 
-        if (ggmlMetalPathResources) {
-            sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"];
+        if (path_resource) {
+            path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
         } else {
-            sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
+            path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
         }
-        if (sourcePath == nil) {
+
+        if (path_source == nil) {
             GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
-            sourcePath = @"ggml-metal.metal";
+            path_source = @"ggml-metal.metal";
         }
-        GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
-        NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
+
+        GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
+
+        NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
         if (error) {
             GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
             return NULL;
         }
-#endif
+#endif // GGML_METAL_EMBED_LIBRARY
 
         @autoreleasepool {
             // dictionary of preprocessor macros
             NSMutableDictionary * prep = [NSMutableDictionary dictionary];
 
 #ifdef GGML_QKK_64
-            prep[@"QK_K"] = @(64);
+            prep[@"GGML_QKK_64"] = @(1);
 #endif
 
             MTLCompileOptions* options = [MTLCompileOptions new];
@@ -569,6 +588,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true);
@@ -697,6 +718,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
             return false;
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
+        case GGML_OP_ARANGE:
+        case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_ARGSORT:
         case GGML_OP_LEAKY_RELU:
             return true;
@@ -742,7 +765,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
     }
 }
 
-static bool ggml_metal_graph_compute(
+static enum ggml_status ggml_metal_graph_compute(
         struct ggml_metal_context * ctx,
         struct ggml_cgraph * gf) {
 
@@ -1091,7 +1114,8 @@ static bool ggml_metal_graph_compute(
            {
                GGML_ASSERT(ggml_is_contiguous(src0));
 
-                const float scale = *(const float *) dst->op_params;
+                float scale;
+                memcpy(&scale, dst->op_params, sizeof(scale));
 
                int64_t n = ggml_nelements(dst);
 
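
Aside (not part of the diff): dst->op_params is an int32_t array, so reading a stored float through a casted pointer is an aliasing gray area; the memcpy used above is the well-defined way to reinterpret the bytes. A self-contained illustration of the store/load round trip (plain C, no ggml types):

// Illustrative only, not ggml code.
#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void) {
    int32_t op_params[4] = {0};

    const float scale_in = 0.125f;
    memcpy(&op_params[0], &scale_in, sizeof(scale_in));  // store the float bits

    float scale;
    memcpy(&scale, &op_params[0], sizeof(scale));         // load them back safely

    printf("scale = %f\n", scale);                        // prints 0.125000
    return 0;
}
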
@@ -1250,11 +1274,15 @@ static bool ggml_metal_graph_compute(
                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
                }
 
-                const float scale = ((float *) dst->op_params)[0];
-                const float max_bias = ((float *) dst->op_params)[1];
+                float scale;
+                float max_bias;
+
+                memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale));
+                memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
 
                const int64_t nrows_x = ggml_nrows(src0);
                const int64_t nrows_y = src0->ne[1];
+
                const uint32_t n_head_kv = nrows_x/nrows_y;
                const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
 
@@ -1631,8 +1659,8 @@ static bool ggml_metal_graph_compute(
                // TODO: make this more general
                GGML_ASSERT(n_as <= 8);
 
-                // max size of the src1ids array in the kernel stack
-                GGML_ASSERT(ne11 <= 512);
+                // max size of the src1ids array in the kernel shared buffer
+                GGML_ASSERT(ne11 <= 4096);
 
                const int64_t ne20 = src2 ? src2->ne[0] : 0;
                const int64_t ne21 = src2 ? src2->ne[1] : 0;
@@ -1730,7 +1758,7 @@ static bool ggml_metal_graph_compute(
                    [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
                }
 
-                [encoder setThreadgroupMemoryLength:8192 atIndex:0];
+                [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + 2*ne11, 16) atIndex:0];
 
                [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
            } else {
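
Aside (not part of the diff): the threadgroup allocation now grows with ne11, presumably two bytes per src1 row for the id array that the relaxed ne11 <= 4096 assertion above refers to, rounded up to a 16-byte boundary. Assuming GGML_PAD is ggml's round-up-to-a-multiple macro, the arithmetic works out as in this small example:

// Illustration only; GGML_PAD is redefined here under the stated assumption.
#include <stdio.h>

#define GGML_PAD(x, n) (((x) + (n) - 1) / (n) * (n))

int main(void) {
    const int ne11 = 50; // illustrative src1 row count
    // 8192 bytes of matmul scratch + 2*ne11 bytes of ids, padded to 16 bytes
    printf("%d\n", GGML_PAD(8192 + 2*ne11, 16)); // 8292 rounds up to 8304
    return 0;
}
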
@@ -2086,6 +2114,7 @@ static bool ggml_metal_graph_compute(
 
                //const int n_past = ((int32_t *) dst->op_params)[0];
                const int n_head = ((int32_t *) dst->op_params)[1];
+
                float max_bias;
                memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
 
@@ -2300,6 +2329,50 @@ static bool ggml_metal_graph_compute(
 
                [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
            } break;
+        case GGML_OP_ARANGE:
+            {
+                GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+                float start;
+                float step;
+
+                memcpy(&start, ((int32_t *) dst->op_params) + 0, sizeof(float));
+                memcpy(&step, ((int32_t *) dst->op_params) + 2, sizeof(float));
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_dst offset:offs_dst atIndex:0];
+                [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:1];
+                [encoder setBytes:&start length:sizeof(start) atIndex:2];
+                [encoder setBytes:&step length:sizeof(step) atIndex:3];
+
+                const int nth = MIN(1024, ne0);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            {
+                GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+                const int dim = dst->op_params[0];
+                const int max_period = dst->op_params[1];
+
+                const int half = dim / 2;
+
+                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline;
+
+                [encoder setComputePipelineState:pipeline];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+                [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:2];
+                [encoder setBytes:&dim length:sizeof(dim) atIndex:3];
+                [encoder setBytes:&max_period length:sizeof(max_period) atIndex:4];
+
+                const int nth = MIN(1024, half);
+
+                [encoder dispatchThreadgroups:MTLSizeMake(ne00, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+            } break;
        case GGML_OP_ARGSORT:
            {
                GGML_ASSERT(src0->type == GGML_TYPE_F32);
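
Aside (not part of the diff): the two new Metal cases above back the GGML_OP_ARANGE and GGML_OP_TIMESTEP_EMBEDDING operators introduced in this release. A sketch of how they might be used at the graph level on the CPU backend, assuming the ggml_arange() and ggml_timestep_embedding() signatures from this version's ggml.h:

// Sketch only: build and compute a tiny graph with the two new operators.
#include <stdbool.h>
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 16 * 1024 * 1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // 1D ramp: 0, 1, 2, ..., 9
    struct ggml_tensor * t  = ggml_arange(ctx, 0.0f, 10.0f, 1.0f);
    // sinusoidal timestep embedding of dimension 32, as used by diffusion models
    struct ggml_tensor * te = ggml_timestep_embedding(ctx, t, 32, 10000);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, te);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/1);

    printf("embedding shape: %lld x %lld\n", (long long) te->ne[0], (long long) te->ne[1]);

    ggml_free(ctx);
    return 0;
}
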
@@ -2428,7 +2501,7 @@ static bool ggml_metal_graph_compute(
        MTLCommandBufferStatus status = [command_buffer status];
        if (status != MTLCommandBufferStatusCompleted) {
            GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
-            return false;
+            return GGML_STATUS_FAILED;
        }
    }
 
@@ -2437,7 +2510,7 @@ static bool ggml_metal_graph_compute(
        }
 
    }
-    return true;
+    return GGML_STATUS_SUCCESS;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -2739,7 +2812,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe
    UNUSED(backend);
 }
 
-GGML_CALL static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
 
    return ggml_metal_graph_compute(metal_ctx, cgraph);
@@ -2764,6 +2837,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
    /* .graph_plan_compute = */ NULL,
    /* .graph_compute = */ ggml_backend_metal_graph_compute,
    /* .supports_op = */ ggml_backend_metal_supports_op,
+    /* .event_new = */ NULL,
+    /* .event_free = */ NULL,
+    /* .event_record = */ NULL,
+    /* .event_wait = */ NULL,
+    /* .event_synchronize = */ NULL,
 };
 
 void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {