llama_cpp 0.10.1 → 0.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -98,7 +98,10 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
 
 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
+GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+
 GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
+
 GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
 
 // helper to check if the device supports a specific family
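The only public API addition in this release is ggml_backend_metal_buffer_from_ptr, declared above. A minimal usage sketch, assuming the caller has already mmap'd a model file; the wrap_model_file helper and the mmap plumbing are illustrative and not part of the package:

    // Illustrative only: wrap an existing host mapping in a Metal backend buffer.
    // ggml_backend_metal_buffer_from_ptr comes from this diff; everything else here is an assumption.
    #include <fcntl.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    #include "ggml-backend.h"
    #include "ggml-metal.h"

    static ggml_backend_buffer_t wrap_model_file(const char * path, size_t max_tensor_size) {
        const int fd = open(path, O_RDONLY);
        if (fd < 0) {
            return NULL;
        }

        struct stat st;
        fstat(fd, &st);

        // map the whole file; the Metal device can then address it without an extra copy
        void * data = mmap(NULL, (size_t) st.st_size, PROT_READ, MAP_SHARED, fd, 0);
        close(fd);
        if (data == MAP_FAILED) {
            return NULL;
        }

        // size     = total mapped bytes
        // max_size = largest single tensor, so each tensor fits entirely inside one internal view
        return ggml_backend_metal_buffer_from_ptr(data, (size_t) st.st_size, max_tensor_size);
    }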
@@ -180,7 +180,15 @@ struct ggml_metal_context {
 @implementation GGMLMetalClass
 @end
 
-ggml_log_callback ggml_metal_log_callback = NULL;
+
+static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+    fprintf(stderr, "%s", msg);
+
+    UNUSED(level);
+    UNUSED(user_data);
+}
+
+ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
 
 void * ggml_metal_log_user_data = NULL;
 
 void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
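With this change the Metal backend logs to stderr out of the box, so ggml_metal_log_set_callback is only needed to redirect the output. A hedged sketch, assuming the caller wants backend messages in its own FILE handle; my_metal_log and log_file are illustrative names:

    #include <stdio.h>

    #include "ggml.h"
    #include "ggml-metal.h"

    // Illustrative callback: forward Metal backend messages to a caller-owned FILE.
    static void my_metal_log(enum ggml_log_level level, const char * msg, void * user_data) {
        FILE * f = (FILE *) user_data;
        fprintf(f, "[metal:%d] %s", (int) level, msg);
    }

    // during setup (log_file is assumed to be opened by the caller):
    //     ggml_metal_log_set_callback(my_metal_log, log_file);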
@@ -607,12 +615,24 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
 }
 
 // temporarily defined here for compatibility between ggml-backend and the old API
-struct ggml_backend_metal_buffer_context {
-    void * data;
+
+struct ggml_backend_metal_buffer {
+    void * data;
+    size_t size;
 
     id<MTLBuffer> metal;
 };
 
+struct ggml_backend_metal_buffer_context {
+    void * all_data;
+    size_t all_size;
+    bool   owned;
+
+    // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
+    int n_buffers;
+    struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
+};
+
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
@@ -622,17 +642,29 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
 
     const int64_t tsize = ggml_nbytes(t);
 
+    ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
+
     // compatibility with ggml-backend
-    if (t->buffer && t->buffer->buft == ggml_backend_metal_buffer_type()) {
-        struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) t->buffer->context;
+    if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
+        struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
+
+        // find the view that contains the tensor fully
+        for (int i = 0; i < buf_ctx->n_buffers; ++i) {
+            const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
 
-        const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->data;
+            //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
+            if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
+                *offs = (size_t) ioffs;
 
-        GGML_ASSERT(ioffs >= 0 && ioffs + tsize <= (int64_t) t->buffer->size);
+                //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
+
+                return buf_ctx->buffers[i].metal;
+            }
+        }
 
-        *offs = (size_t) ioffs;
+        GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
 
-        return buf_ctx->metal;
+        return nil;
     }
 
     // find the view that contains the tensor fully
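The lookup above now has to decide which of the (possibly overlapping) views a tensor lives in. The containment test can be restated on its own; the names and types below are simplified stand-ins, not part of the diff:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    // A tensor starting at host address t_data and spanning t_size bytes belongs to a view
    // when its offset into that view is non-negative and the whole tensor ends before the
    // view does; the same offset is then used when binding the Metal buffer.
    static bool view_contains(const void * view_data, size_t view_size,
                              const void * t_data,    size_t t_size, size_t * offs) {
        const int64_t ioffs = (const char *) t_data - (const char *) view_data;
        if (ioffs >= 0 && (size_t) ioffs + t_size <= view_size) {
            *offs = (size_t) ioffs;
            return true;
        }
        return false;
    }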
@@ -1261,7 +1293,7 @@ void ggml_metal_graph_compute(
                 {
                     GGML_ASSERT(ggml_is_contiguous(src0));
 
-                    const float scale = *(const float *) src1->data;
+                    const float scale = *(const float *) dst->op_params;
 
                     int64_t n = ggml_nelements(dst);
 
@@ -1272,8 +1304,8 @@ void ggml_metal_graph_compute(
                         [encoder setComputePipelineState:ctx->pipeline_scale];
                     }
 
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                     [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
 
                     [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
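In the two hunks above, the scale factor is now read from the op's parameter block instead of a second input tensor, so the constant never has to live in device-visible memory. A minimal restatement of how such a parameter is read back; read_scale_param is an illustrative helper, not part of the diff:

    #include <string.h>

    #include "ggml.h"

    // ggml ops stash small constants in the destination tensor's op_params area;
    // the Metal path above casts those bytes directly, this helper memcpy's them.
    static float read_scale_param(const struct ggml_tensor * dst) {
        float scale;
        memcpy(&scale, dst->op_params, sizeof(scale));
        return scale;
    }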
@@ -2361,6 +2393,7 @@ void ggml_metal_graph_compute(
 
 // backend interface
 
+// default buffer
 static id<MTLDevice> g_backend_device = nil;
 static int g_backend_device_ref_count = 0;
 
@@ -2388,34 +2421,31 @@ static void ggml_backend_metal_free_device(void) {
 static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
-    return ctx->data;
+    return ctx->all_data;
 }
 
 static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
-    [ctx->metal release];
+    for (int i = 0; i < ctx->n_buffers; i++) {
+        [ctx->buffers[i].metal release];
+    }
     ggml_backend_metal_free_device();
 
-    free(ctx->data);
-    free(ctx);
+    if (ctx->owned) {
+        free(ctx->all_data);
+    }
 
-    UNUSED(buffer);
+    free(ctx);
 }
 
 static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy((char *)tensor->data + offset, data, size);
 
     UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy(data, (const char *)tensor->data + offset, size);
 
     UNUSED(buffer);
@@ -2433,7 +2463,13 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer
     UNUSED(buffer);
 }
 
-static struct ggml_backend_buffer_i metal_backend_buffer_i = {
+static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    memset(ctx->all_data, value, ctx->all_size);
+}
+
+static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
     /* .free_buffer     = */ ggml_backend_metal_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_metal_buffer_get_base,
     /* .init_tensor     = */ NULL,
@@ -2441,8 +2477,11 @@ static struct ggml_backend_buffer_i metal_backend_buffer_i = {
     /* .get_tensor      = */ ggml_backend_metal_buffer_get_tensor,
     /* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from,
     /* .cpy_tensor_to   = */ ggml_backend_metal_buffer_cpy_tensor_to,
+    /* .clear           = */ ggml_backend_metal_buffer_clear,
 };
 
+// default buffer type
+
 static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
 
@@ -2453,13 +2492,46 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    ctx->data  = ggml_metal_host_malloc(size);
-    ctx->metal = [ggml_backend_metal_get_device() newBufferWithBytesNoCopy:ctx->data
+    id<MTLDevice> device = ggml_backend_metal_get_device();
+
+    ctx->all_data  = ggml_metal_host_malloc(size_aligned);
+    ctx->all_size  = size_aligned;
+    ctx->owned     = true;
+    ctx->n_buffers = 1;
+
+    ctx->buffers[0].data  = ctx->all_data;
+    ctx->buffers[0].size  = size;
+    ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
                     length:size_aligned
                     options:MTLResourceStorageModeShared
                     deallocator:nil];
 
-    return ggml_backend_buffer_init(buft, metal_backend_buffer_i, ctx, size);
+    if (ctx->buffers[0].metal == nil) {
+        GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+        free(ctx);
+        ggml_backend_metal_free_device();
+        return NULL;
+    }
+
+    GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
+
+
+#if TARGET_OS_OSX
+    GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
+        device.currentAllocatedSize / 1024.0 / 1024.0,
+        device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+    if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
+        GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+    } else {
+        GGML_METAL_LOG_INFO("\n");
+    }
+#else
+    GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
+#endif
+
+
+    return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
 }
 
 static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
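The size_aligned computation above rounds the requested size up to a whole number of pages, since the length passed to newBufferWithBytesNoCopy must be page-aligned. The same rule, isolated as a tiny standalone sketch (align_to_page is an illustrative helper):

    #include <stddef.h>
    #include <unistd.h>

    // Round a byte count up to the next multiple of the system page size,
    // mirroring the alignment done before the no-copy Metal buffer is created.
    static size_t align_to_page(size_t size) {
        const size_t size_page = (size_t) sysconf(_SC_PAGESIZE);
        size_t size_aligned = size;
        if ((size_aligned % size_page) != 0) {
            size_aligned += size_page - (size_aligned % size_page);
        }
        return size_aligned;
    }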
@@ -2470,7 +2542,13 @@ static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_t
 static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
 
-    GGML_UNUSED(buft);
+    UNUSED(buft);
+}
+
+static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    UNUSED(buft);
 }
 
 ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
@@ -2480,6 +2558,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
         /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
         /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
         /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
+        /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
     },
     /* .context = */ NULL,
 };
@@ -2487,6 +2566,87 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
     return &ggml_backend_buffer_type_metal;
 }
 
+// buffer from ptr
+
+ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
+    struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
+
+    ctx->all_data = data;
+    ctx->all_size = size;
+    ctx->owned = false;
+    ctx->n_buffers = 0;
+
+    const size_t size_page = sysconf(_SC_PAGESIZE);
+    size_t size_aligned = size;
+    if ((size_aligned % size_page) != 0) {
+        size_aligned += (size_page - (size_aligned % size_page));
+    }
+
+    id<MTLDevice> device = ggml_backend_metal_get_device();
+
+    // the buffer fits into the max buffer size allowed by the device
+    if (size_aligned <= device.maxBufferLength) {
+        ctx->buffers[ctx->n_buffers].data = data;
+        ctx->buffers[ctx->n_buffers].size = size;
+
+        ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+        if (ctx->buffers[ctx->n_buffers].metal == nil) {
+            GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+            return false;
+        }
+
+        GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
+
+        ++ctx->n_buffers;
+    } else {
+        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
+        // one of the views
+        const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
+        const size_t size_step = device.maxBufferLength - size_ovlp;
+        const size_t size_view = device.maxBufferLength;
+
+        for (size_t i = 0; i < size; i += size_step) {
+            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
+
+            ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
+            ctx->buffers[ctx->n_buffers].size = size_step_aligned;
+
+            ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+            if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
+                return false;
+            }
+
+            GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i);
+            if (i + size_step < size) {
+                GGML_METAL_LOG_INFO("\n");
+            }
+
+            ++ctx->n_buffers;
+        }
+    }
+
+#if TARGET_OS_OSX
+    GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
+        device.currentAllocatedSize / 1024.0 / 1024.0,
+        device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+    if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
+        GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+    } else {
+        GGML_METAL_LOG_INFO("\n");
+    }
+#else
+    GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
+#endif
+
+    return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
+}
+
+// backend
+
 static const char * ggml_backend_metal_name(ggml_backend_t backend) {
     return "Metal";
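When the mapped region is larger than the device's maxBufferLength, the new function above splits it into overlapping views: the stride between view starts is maxBufferLength minus max_size rounded up by an extra page, so every tensor is guaranteed to lie entirely within some view. A worked example of that layout; all the numbers are illustrative, not measured:

    #include <stddef.h>
    #include <stdio.h>

    // Print the view offsets/lengths the overlapping-view scheme would produce
    // for a hypothetical 20 GiB mapping on a device with an 8 GiB buffer limit.
    int main(void) {
        const size_t page       = 16384;                      // page size (illustrative)
        const size_t max_buffer = 8ull  * 1024 * 1024 * 1024;  // device.maxBufferLength (illustrative)
        const size_t max_size   = 512ull * 1024 * 1024;        // largest tensor in the file (illustrative)
        const size_t total      = 20ull * 1024 * 1024 * 1024;  // mapped file size (illustrative)

        const size_t size_ovlp = ((max_size + page - 1) / page + 1) * page; // round up, plus one extra page
        const size_t size_step = max_buffer - size_ovlp;                    // stride between view starts
        const size_t size_view = max_buffer;                                // length of each full view

        for (size_t i = 0; i < total; i += size_step) {
            const size_t len = (i + size_view <= total) ? size_view : (total - i);
            printf("view @ offset %zu, length %zu\n", i, len);
        }
        return 0;
    }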
 
@@ -2499,10 +2659,6 @@ static void ggml_backend_metal_free(ggml_backend_t backend) {
     free(backend);
 }
 
-static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
-    UNUSED(backend);
-}
-
 static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
     return ggml_backend_metal_buffer_type();
 
@@ -2529,25 +2685,15 @@ static struct ggml_backend_i metal_backend_i = {
     /* .get_tensor_async      = */ NULL,
     /* .cpy_tensor_from_async = */ NULL,
     /* .cpy_tensor_to_async   = */ NULL,
-    /* .synchronize           = */ ggml_backend_metal_synchronize,
-    /* .graph_plan_create     = */ NULL, // the metal implementation does not require creating graph plans atm
+    /* .synchronize           = */ NULL,
+    /* .graph_plan_create     = */ NULL,
     /* .graph_plan_free       = */ NULL,
     /* .graph_plan_compute    = */ NULL,
     /* .graph_compute         = */ ggml_backend_metal_graph_compute,
     /* .supports_op           = */ ggml_backend_metal_supports_op,
 };
 
-// TODO: make a common log callback for all backends in ggml-backend
-static void ggml_backend_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
-    fprintf(stderr, "%s", msg);
-
-    UNUSED(level);
-    UNUSED(user_data);
-}
-
 ggml_backend_t ggml_backend_metal_init(void) {
-    ggml_metal_log_set_callback(ggml_backend_log_callback, NULL);
-
     struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
 
     if (ctx == NULL) {
@@ -1702,8 +1702,9 @@ kernel void kernel_rope(
                     dst_data[1] = x0*sin_theta + x1*cos_theta;
                 }
             } else {
-                for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-                    for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
+                for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) {
+                    if (ic < n_dims) {
+                        const int64_t ib = 0;
 
                         // simplified from `(ib * n_dims + ic) * inv_ndims`
                         const float cur_rot = inv_ndims*ic - ib;
@@ -1722,6 +1723,14 @@ kernel void kernel_rope(
 
                         dst_data[0]        = x0*cos_theta - x1*sin_theta;
                         dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+                    } else {
+                        const int64_t i0 = ic;
+
+                        device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                        device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                        dst_data[0] = src[0];
+                        dst_data[1] = src[1];
                     }
                 }
             }
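The two kernel_rope hunks above change the loop structure: each thread now walks the full row (ne0), rotating positions below n_dims as before and copying the remaining positions through unchanged. A CPU-side restatement of that control flow with a deliberately simplified angle schedule; rope_row_partial and its parameters are illustrative, not part of the shader:

    #include <math.h>
    #include <stdint.h>

    // Rotate the first n_dims elements of a row (pairing i0 with i0 + n_dims/2)
    // and pass the tail through unchanged, mirroring the new if/else in kernel_rope.
    // theta_base/theta_scale stand in for the kernel's real frequency computation.
    static void rope_row_partial(const float * src, float * dst, int64_t ne0, int64_t n_dims,
                                 float theta_base, float theta_scale) {
        for (int64_t ic = 0; ic < ne0; ic += 2) {
            if (ic < n_dims) {
                const int64_t i0 = ic/2;

                const float theta     = theta_base * powf(theta_scale, (float)(ic/2));
                const float cos_theta = cosf(theta);
                const float sin_theta = sinf(theta);

                const float x0 = src[i0];
                const float x1 = src[i0 + n_dims/2];

                dst[i0]            = x0*cos_theta - x1*sin_theta;
                dst[i0 + n_dims/2] = x0*sin_theta + x1*cos_theta;
            } else {
                // the non-rotated tail is copied as-is
                dst[ic + 0] = src[ic + 0];
                dst[ic + 1] = src[ic + 1];
            }
        }
    }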