llama_cpp 0.10.1 → 0.10.2

@@ -98,7 +98,10 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
 
 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
+GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+
 GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
+
 GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
 
 // helper to check if the device supports a specific family
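The new ggml_backend_metal_buffer_from_ptr entry point wraps caller-owned memory (typically an mmap-ed model file) in a Metal backend buffer without copying; max_size is the size of the largest tensor stored in that memory, which the implementation later in this diff uses to size the overlap between buffer views. A minimal usage sketch in C, assuming the vendored ggml-metal.h is on the include path (map_weights and the file layout are hypothetical):

// hypothetical helper: map a weights file and hand it to the Metal backend in place
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

#include "ggml-metal.h"

static ggml_backend_buffer_t map_weights(const char * path, size_t max_tensor_size) {
    int fd = open(path, O_RDONLY);
    if (fd < 0) return NULL;

    struct stat st;
    if (fstat(fd, &st) != 0) { close(fd); return NULL; }

    // mmap returns page-aligned memory, which the no-copy Metal buffers require
    void * data = mmap(NULL, (size_t) st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    close(fd);
    if (data == MAP_FAILED) return NULL;

    // the backend wraps the region in one or more no-copy MTLBuffer views
    return ggml_backend_metal_buffer_from_ptr(data, (size_t) st.st_size, max_tensor_size);
}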
@@ -180,7 +180,15 @@ struct ggml_metal_context {
 @implementation GGMLMetalClass
 @end
 
-ggml_log_callback ggml_metal_log_callback = NULL;
+
+static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+    fprintf(stderr, "%s", msg);
+
+    UNUSED(level);
+    UNUSED(user_data);
+}
+
+ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
 void * ggml_metal_log_user_data = NULL;
 
 void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
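The Metal backend now installs ggml_metal_default_log_callback by default, so messages reach stderr even when no callback is registered. An application can still route the logs elsewhere through ggml_metal_log_set_callback; a minimal sketch of a compatible callback (my_metal_logger and the file handling are illustrative, not part of the gem):

#include <stdio.h>

#include "ggml-metal.h"

// illustrative logger matching the ggml_log_callback signature
static void my_metal_logger(enum ggml_log_level level, const char * msg, void * user_data) {
    FILE * out = user_data ? (FILE *) user_data : stderr;   // user_data is passed through unchanged
    fprintf(out, "[metal] %s", msg);
    (void) level;
}

// during initialization:
//     ggml_metal_log_set_callback(my_metal_logger, stderr);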
@@ -607,12 +615,24 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
 }
 
 // temporarily defined here for compatibility between ggml-backend and the old API
-struct ggml_backend_metal_buffer_context {
-    void * data;
+
+struct ggml_backend_metal_buffer {
+    void * data;
+    size_t size;
 
     id<MTLBuffer> metal;
 };
 
+struct ggml_backend_metal_buffer_context {
+    void * all_data;
+    size_t all_size;
+    bool   owned;
+
+    // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
+    int n_buffers;
+    struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
+};
+
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
@@ -622,17 +642,29 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
 
     const int64_t tsize = ggml_nbytes(t);
 
+    ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
+
     // compatibility with ggml-backend
-    if (t->buffer && t->buffer->buft == ggml_backend_metal_buffer_type()) {
-        struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) t->buffer->context;
+    if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
+        struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
+
+        // find the view that contains the tensor fully
+        for (int i = 0; i < buf_ctx->n_buffers; ++i) {
+            const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
 
-        const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->data;
+            //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
+            if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
+                *offs = (size_t) ioffs;
 
-        GGML_ASSERT(ioffs >= 0 && ioffs + tsize <= (int64_t) t->buffer->size);
+                //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
+
+                return buf_ctx->buffers[i].metal;
+            }
+        }
 
-        *offs = (size_t) ioffs;
+        GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
 
-        return buf_ctx->metal;
+        return nil;
     }
 
     // find the view that contains the tensor fully
@@ -1261,7 +1293,7 @@ void ggml_metal_graph_compute(
                 {
                     GGML_ASSERT(ggml_is_contiguous(src0));
 
-                    const float scale = *(const float *) src1->data;
+                    const float scale = *(const float *) dst->op_params;
 
                     int64_t n = ggml_nelements(dst);
 
@@ -1272,8 +1304,8 @@ void ggml_metal_graph_compute(
                         [encoder setComputePipelineState:ctx->pipeline_scale];
                     }
 
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
 
                     [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
 
                     [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
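With this change the scale factor for the scale op is read from the destination tensor's op_params blob rather than from a second source tensor. As a rough illustration of that read pattern (ggml keeps small per-op constants in an int32-backed op_params array; the snippet is a standalone sketch, not the gem's code):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    int32_t op_params[16] = {0};                      // stand-in for ggml_tensor::op_params

    const float scale = 0.125f;
    memcpy(op_params, &scale, sizeof(scale));         // what the host-side op construction stores

    const float read = *(const float *) op_params;    // what the Metal path above reads back
    printf("scale = %f\n", read);
    return 0;
}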
@@ -2361,6 +2393,7 @@ void ggml_metal_graph_compute(
 
 // backend interface
 
+// default buffer
 static id<MTLDevice> g_backend_device = nil;
 static int g_backend_device_ref_count = 0;
 
@@ -2388,34 +2421,31 @@ static void ggml_backend_metal_free_device(void) {
 static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
-    return ctx->data;
+    return ctx->all_data;
 }
 
 static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
-    [ctx->metal release];
+    for (int i = 0; i < ctx->n_buffers; i++) {
+        [ctx->buffers[i].metal release];
+    }
     ggml_backend_metal_free_device();
 
-    free(ctx->data);
-    free(ctx);
+    if (ctx->owned) {
+        free(ctx->all_data);
+    }
 
-    UNUSED(buffer);
+    free(ctx);
 }
 
 static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy((char *)tensor->data + offset, data, size);
 
     UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy(data, (const char *)tensor->data + offset, size);
 
     UNUSED(buffer);
@@ -2433,7 +2463,13 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer
     UNUSED(buffer);
 }
 
-static struct ggml_backend_buffer_i metal_backend_buffer_i = {
+static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    memset(ctx->all_data, value, ctx->all_size);
+}
+
+static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
     /* .free_buffer     = */ ggml_backend_metal_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_metal_buffer_get_base,
     /* .init_tensor     = */ NULL,
@@ -2441,8 +2477,11 @@ static struct ggml_backend_buffer_i metal_backend_buffer_i = {
     /* .get_tensor      = */ ggml_backend_metal_buffer_get_tensor,
     /* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from,
     /* .cpy_tensor_to   = */ ggml_backend_metal_buffer_cpy_tensor_to,
+    /* .clear           = */ ggml_backend_metal_buffer_clear,
 };
 
+// default buffer type
+
 static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
 
@@ -2453,13 +2492,46 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    ctx->data = ggml_metal_host_malloc(size);
-    ctx->metal = [ggml_backend_metal_get_device() newBufferWithBytesNoCopy:ctx->data
+    id<MTLDevice> device = ggml_backend_metal_get_device();
+
+    ctx->all_data = ggml_metal_host_malloc(size_aligned);
+    ctx->all_size = size_aligned;
+    ctx->owned = true;
+    ctx->n_buffers = 1;
+
+    ctx->buffers[0].data = ctx->all_data;
+    ctx->buffers[0].size = size;
+    ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
                     length:size_aligned
                     options:MTLResourceStorageModeShared
                     deallocator:nil];
 
-    return ggml_backend_buffer_init(buft, metal_backend_buffer_i, ctx, size);
+    if (ctx->buffers[0].metal == nil) {
+        GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+        free(ctx);
+        ggml_backend_metal_free_device();
+        return NULL;
+    }
+
+    GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
+
+
+#if TARGET_OS_OSX
+    GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
+            device.currentAllocatedSize / 1024.0 / 1024.0,
+            device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+    if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
+        GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+    } else {
+        GGML_METAL_LOG_INFO("\n");
+    }
+#else
+    GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
+#endif
+
+
+    return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
 }
 
 static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
@@ -2470,7 +2542,13 @@ static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_t
 static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
 
-    GGML_UNUSED(buft);
+    UNUSED(buft);
+}
+
+static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    UNUSED(buft);
 }
 
 ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
@@ -2480,6 +2558,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
             /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
+            /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
         },
         /* .context = */ NULL,
     };
@@ -2487,6 +2566,87 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
     return &ggml_backend_buffer_type_metal;
 }
 
+// buffer from ptr
+
+ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
+    struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
+
+    ctx->all_data = data;
+    ctx->all_size = size;
+    ctx->owned = false;
+    ctx->n_buffers = 0;
+
+    const size_t size_page = sysconf(_SC_PAGESIZE);
+    size_t size_aligned = size;
+    if ((size_aligned % size_page) != 0) {
+        size_aligned += (size_page - (size_aligned % size_page));
+    }
+
+    id<MTLDevice> device = ggml_backend_metal_get_device();
+
+    // the buffer fits into the max buffer size allowed by the device
+    if (size_aligned <= device.maxBufferLength) {
+        ctx->buffers[ctx->n_buffers].data = data;
+        ctx->buffers[ctx->n_buffers].size = size;
+
+        ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+        if (ctx->buffers[ctx->n_buffers].metal == nil) {
+            GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+            return false;
+        }
+
+        GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
+
+        ++ctx->n_buffers;
+    } else {
+        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
+        // one of the views
+        const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
+        const size_t size_step = device.maxBufferLength - size_ovlp;
+        const size_t size_view = device.maxBufferLength;
+
+        for (size_t i = 0; i < size; i += size_step) {
+            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
+
+            ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
+            ctx->buffers[ctx->n_buffers].size = size_step_aligned;
+
+            ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+            if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
+                return false;
+            }
+
+            GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i);
+            if (i + size_step < size) {
+                GGML_METAL_LOG_INFO("\n");
+            }
+
+            ++ctx->n_buffers;
+        }
+    }
+
+#if TARGET_OS_OSX
+    GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
+            device.currentAllocatedSize / 1024.0 / 1024.0,
+            device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+    if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
+        GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+    } else {
+        GGML_METAL_LOG_INFO("\n");
+    }
+#else
+    GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
+#endif
+
+    return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
+}
+
+// backend
+
 static const char * ggml_backend_metal_name(ggml_backend_t backend) {
     return "Metal";
 
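A worked example of the view-splitting math above, with hypothetical sizes: views are size_view bytes long but advance by only size_step bytes, so consecutive views overlap by size_ovlp bytes, which is at least the largest tensor rounded up to whole pages; every tensor therefore lies entirely inside at least one view. A standalone C sketch (all numbers are assumed, not taken from any real device; the last view's length is shown unaligned for simplicity):

#include <stddef.h>
#include <stdio.h>

int main(void) {
    const size_t size_page = 16384;                        // assumed page size
    const size_t max_buf   = 4ull  * 1024 * 1024 * 1024;   // assumed device maxBufferLength
    const size_t max_size  = 256ull * 1024 * 1024;         // assumed largest tensor in the mapping
    const size_t size      = 10ull * 1024 * 1024 * 1024;   // assumed total mapped size

    // same formulas as in ggml_backend_metal_buffer_from_ptr
    const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page;
    const size_t size_step = max_buf - size_ovlp;
    const size_t size_view = max_buf;

    int n_views = 0;
    for (size_t i = 0; i < size; i += size_step) {
        const size_t len = (i + size_view <= size) ? size_view : (size - i);
        printf("view %d: offset = %zu, length = %zu\n", n_views, i, len);
        ++n_views;
    }
    printf("overlap between consecutive views = %zu bytes (>= max tensor size %zu)\n",
           size_ovlp, max_size);
    return 0;
}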
@@ -2499,10 +2659,6 @@ static void ggml_backend_metal_free(ggml_backend_t backend) {
     free(backend);
 }
 
-static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
-    UNUSED(backend);
-}
-
 static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
     return ggml_backend_metal_buffer_type();
 
@@ -2529,25 +2685,15 @@ static struct ggml_backend_i metal_backend_i = {
     /* .get_tensor_async      = */ NULL,
     /* .cpy_tensor_from_async = */ NULL,
     /* .cpy_tensor_to_async   = */ NULL,
-    /* .synchronize           = */ ggml_backend_metal_synchronize,
-    /* .graph_plan_create     = */ NULL, // the metal implementation does not require creating graph plans atm
+    /* .synchronize           = */ NULL,
+    /* .graph_plan_create     = */ NULL,
     /* .graph_plan_free       = */ NULL,
     /* .graph_plan_compute    = */ NULL,
     /* .graph_compute         = */ ggml_backend_metal_graph_compute,
     /* .supports_op           = */ ggml_backend_metal_supports_op,
 };
 
-// TODO: make a common log callback for all backends in ggml-backend
-static void ggml_backend_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
-    fprintf(stderr, "%s", msg);
-
-    UNUSED(level);
-    UNUSED(user_data);
-}
-
 ggml_backend_t ggml_backend_metal_init(void) {
-    ggml_metal_log_set_callback(ggml_backend_log_callback, NULL);
-
     struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
 
     if (ctx == NULL) {
@@ -1702,8 +1702,9 @@ kernel void kernel_rope(
                 dst_data[1] = x0*sin_theta + x1*cos_theta;
             }
         } else {
-            for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-                for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
+            for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) {
+                if (ic < n_dims) {
+                    const int64_t ib = 0;
 
                     // simplified from `(ib * n_dims + ic) * inv_ndims`
                     const float cur_rot = inv_ndims*ic - ib;
@@ -1722,6 +1723,14 @@ kernel void kernel_rope(
 
                     dst_data[0]        = x0*cos_theta - x1*sin_theta;
                     dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+                } else {
+                    const int64_t i0 = ic;
+
+                    device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                    device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                    dst_data[0] = src[0];
+                    dst_data[1] = src[1];
                 }
             }
         }
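For context, the kernel_rope change above lets this path handle n_dims < ne0: pairs inside the first n_dims components are rotated as before, while components past n_dims are now copied through unchanged instead of being skipped. A simplified host-side reference in C (rope_neox_row_ref is hypothetical and the theta handling is condensed):

#include <math.h>

// reference for a single row: rotate pairs (i0, i0 + n_dims/2) inside the first
// n_dims components, pass everything past n_dims through unchanged
static void rope_neox_row_ref(const float * src, float * dst, int ne0, int n_dims,
                              float theta_base, float theta_scale) {
    for (int ic = 0; ic < ne0; ic += 2) {
        if (ic < n_dims) {
            const float theta     = theta_base * powf(theta_scale, (float)(ic/2));
            const float cos_theta = cosf(theta);
            const float sin_theta = sinf(theta);

            const int i0 = ic/2;

            const float x0 = src[i0];
            const float x1 = src[i0 + n_dims/2];

            dst[i0]            = x0*cos_theta - x1*sin_theta;
            dst[i0 + n_dims/2] = x0*sin_theta + x1*cos_theta;
        } else {
            // mirrors the new else-branch: components beyond n_dims are copied as-is
            dst[ic + 0] = src[ic + 0];
            dst[ic + 1] = src[ic + 1];
        }
    }
}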