llama_cpp 0.10.1 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +16 -1
- data/ext/llama_cpp/src/ggml-alloc.c +12 -4
- data/ext/llama_cpp/src/ggml-backend-impl.h +12 -8
- data/ext/llama_cpp/src/ggml-backend.c +75 -5
- data/ext/llama_cpp/src/ggml-backend.h +7 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +284 -162
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +190 -44
- data/ext/llama_cpp/src/ggml-metal.metal +11 -2
- data/ext/llama_cpp/src/ggml.c +262 -89
- data/ext/llama_cpp/src/ggml.h +24 -10
- data/ext/llama_cpp/src/llama.cpp +926 -780
- data/ext/llama_cpp/src/llama.h +8 -3
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/ggml-metal.h

@@ -98,7 +98,10 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
 
 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
+GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+
 GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
+
 GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
 
 // helper to check if the device supports a specific family
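The new `ggml_backend_metal_buffer_from_ptr` entry point wraps an existing host allocation (for example an mmap-ed model file) in a Metal backend buffer without copying it. Below is a minimal caller-side sketch in C; the helper name and parameters are illustrative, only `ggml_backend_metal_buffer_from_ptr` itself comes from this diff:

```c
#include <stddef.h>

#include "ggml-backend.h"
#include "ggml-metal.h"

// Illustrative helper: hand an already-mapped host region to the Metal backend.
// max_tensor_size is the size in bytes of the largest tensor stored in the region;
// it lets the backend size the overlap between its internal buffer views.
static ggml_backend_buffer_t wrap_host_region(void * addr, size_t size, size_t max_tensor_size) {
    return ggml_backend_metal_buffer_from_ptr(addr, size, max_tensor_size);
}
```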
data/ext/llama_cpp/src/ggml-metal.m

@@ -180,7 +180,15 @@ struct ggml_metal_context {
 @implementation GGMLMetalClass
 @end
 
-ggml_log_callback ggml_metal_log_callback = NULL;
+
+static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+    fprintf(stderr, "%s", msg);
+
+    UNUSED(level);
+    UNUSED(user_data);
+}
+
+ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
 void * ggml_metal_log_user_data = NULL;
 
 void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
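`ggml_metal_log_set_callback` (unchanged here, shown as context) can still override the new default. A small sketch, assuming only the callback signature visible above, that routes Metal log messages to an arbitrary FILE *:

```c
#include <stdio.h>

#include "ggml.h"
#include "ggml-metal.h"

// Matches the callback shape used by ggml_metal_default_log_callback above.
static void my_metal_log(enum ggml_log_level level, const char * msg, void * user_data) {
    fprintf((FILE *) user_data, "[metal] %s", msg);
    (void) level;
}

// During initialization:
//   ggml_metal_log_set_callback(my_metal_log, stderr);
```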
@@ -607,12 +615,24 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
 }
 
 // temporarily defined here for compatibility between ggml-backend and the old API
-struct ggml_backend_metal_buffer_context {
-    void * data;
+
+struct ggml_backend_metal_buffer {
+    void * data;
+    size_t size;
 
     id<MTLBuffer> metal;
 };
 
+struct ggml_backend_metal_buffer_context {
+    void * all_data;
+    size_t all_size;
+    bool owned;
+
+    // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
+    int n_buffers;
+    struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
+};
+
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
@@ -622,17 +642,29 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
 
     const int64_t tsize = ggml_nbytes(t);
 
+    ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
+
     // compatibility with ggml-backend
-    if (t->buffer && t->buffer->buft == ggml_backend_metal_buffer_type()) {
-        struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) t->buffer->context;
+    if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
+        struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
+
+        // find the view that contains the tensor fully
+        for (int i = 0; i < buf_ctx->n_buffers; ++i) {
+            const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
 
-        const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->data;
+            //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
+            if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
+                *offs = (size_t) ioffs;
 
-        GGML_ASSERT(ioffs >= 0 && ioffs + tsize <= (int64_t) ggml_backend_buffer_get_size(t->buffer));
+                //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
+
+                return buf_ctx->buffers[i].metal;
+            }
+        }
 
-        *offs = (size_t) ioffs;
+        GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
 
-        return buf_ctx->metal;
+        return nil;
     }
 
     // find the view that contains the tensor fully
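The containment test above is plain pointer arithmetic: a tensor at host address p with tsize bytes belongs to view i when p lies at a non-negative offset inside that view and the whole tensor ends before the view does. A standalone C sketch of the same check (the `view` struct and `view_contains` are illustrative names, not from the diff):

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct view {
    void * data;   // host pointer backing the view
    size_t size;   // view length in bytes
};

// Returns true and sets *offs when [p, p + nbytes) lies entirely inside v.
static bool view_contains(const struct view * v, const void * p, size_t nbytes, size_t * offs) {
    const int64_t ioffs = (const char *) p - (const char *) v->data;
    if (ioffs >= 0 && (uint64_t) ioffs + nbytes <= v->size) {
        *offs = (size_t) ioffs;
        return true;
    }
    return false;
}
```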
@@ -1261,7 +1293,7 @@ void ggml_metal_graph_compute(
                 {
                     GGML_ASSERT(ggml_is_contiguous(src0));
 
-                    const float scale = *(const float *) src1->data;
+                    const float scale = *(const float *) dst->op_params;
 
                     int64_t n = ggml_nelements(dst);
 
@@ -1272,8 +1304,8 @@ void ggml_metal_graph_compute(
                         [encoder setComputePipelineState:ctx->pipeline_scale];
                     }
 
-                    [encoder setBuffer:id_src0 offset:offs_src0  atIndex:0];
-                    [encoder setBuffer:id_dst  offset:offs_dst   atIndex:1];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                     [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
 
                     [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
@@ -2361,6 +2393,7 @@ void ggml_metal_graph_compute(
 
 // backend interface
 
+// default buffer
 static id<MTLDevice> g_backend_device = nil;
 static int g_backend_device_ref_count = 0;
 
@@ -2388,34 +2421,31 @@ static void ggml_backend_metal_free_device(void) {
 static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
-    return ctx->data;
+    return ctx->all_data;
 }
 
 static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
-    [ctx->metal release];
+    for (int i = 0; i < ctx->n_buffers; i++) {
+        [ctx->buffers[i].metal release];
+    }
     ggml_backend_metal_free_device();
 
-    free(ctx->data);
-    free(ctx);
+    if (ctx->owned) {
+        free(ctx->all_data);
+    }
 
-    UNUSED(buffer);
+    free(ctx);
 }
 
 static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy((char *)tensor->data + offset, data, size);
 
     UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy(data, (const char *)tensor->data + offset, size);
 
     UNUSED(buffer);
@@ -2433,7 +2463,13 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer
     UNUSED(buffer);
 }
 
-static struct ggml_backend_buffer_i metal_backend_buffer_i = {
+static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    memset(ctx->all_data, value, ctx->all_size);
+}
+
+static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
     /* .free_buffer     = */ ggml_backend_metal_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_metal_buffer_get_base,
     /* .init_tensor     = */ NULL,
@@ -2441,8 +2477,11 @@ static struct ggml_backend_buffer_i metal_backend_buffer_i = {
     /* .get_tensor      = */ ggml_backend_metal_buffer_get_tensor,
     /* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from,
     /* .cpy_tensor_to   = */ ggml_backend_metal_buffer_cpy_tensor_to,
+    /* .clear           = */ ggml_backend_metal_buffer_clear,
 };
 
+// default buffer type
+
 static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
 
@@ -2453,13 +2492,46 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    ctx->data  = ggml_metal_host_malloc(size_aligned);
-    ctx->metal = [ggml_backend_metal_get_device() newBufferWithBytesNoCopy:ctx->data
+    id<MTLDevice> device = ggml_backend_metal_get_device();
+
+    ctx->all_data = ggml_metal_host_malloc(size_aligned);
+    ctx->all_size = size_aligned;
+    ctx->owned = true;
+    ctx->n_buffers = 1;
+
+    ctx->buffers[0].data = ctx->all_data;
+    ctx->buffers[0].size = size;
+    ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
                     length:size_aligned
                     options:MTLResourceStorageModeShared
                     deallocator:nil];
 
-    return ggml_backend_buffer_init(buft, metal_backend_buffer_i, ctx, size);
+    if (ctx->buffers[0].metal == nil) {
+        GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+        free(ctx);
+        ggml_backend_metal_free_device();
+        return NULL;
+    }
+
+    GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
+
+
+#if TARGET_OS_OSX
+    GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
+        device.currentAllocatedSize / 1024.0 / 1024.0,
+        device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+    if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
+        GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+    } else {
+        GGML_METAL_LOG_INFO("\n");
+    }
+#else
+    GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
+#endif
+
+
+    return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
 }
 
 static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
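The round-up above pads the requested size to a whole number of pages before asking Metal for a no-copy buffer. A worked example in C, assuming a 16 KiB page size for the arithmetic:

```c
#include <stddef.h>

// Pad a byte count up to a whole number of pages, as in the hunk above.
// Example with 16 KiB pages: size = 100000 -> 100000 % 16384 = 1696,
// so size_aligned = 100000 + (16384 - 1696) = 114688 = 7 * 16384.
static size_t align_to_page(size_t size, size_t size_page) {
    size_t size_aligned = size;
    if ((size_aligned % size_page) != 0) {
        size_aligned += (size_page - (size_aligned % size_page));
    }
    return size_aligned;
}
```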
@@ -2470,7 +2542,13 @@ static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_t
 static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
 
-    GGML_UNUSED(buft);
+    UNUSED(buft);
+}
+
+static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    UNUSED(buft);
 }
 
 ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
@@ -2480,6 +2558,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
             /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
+            /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
         },
         /* .context = */ NULL,
     };
@@ -2487,6 +2566,87 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
     return &ggml_backend_buffer_type_metal;
 }
 
+// buffer from ptr
+
+ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
+    struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
+
+    ctx->all_data = data;
+    ctx->all_size = size;
+    ctx->owned = false;
+    ctx->n_buffers = 0;
+
+    const size_t size_page = sysconf(_SC_PAGESIZE);
+    size_t size_aligned = size;
+    if ((size_aligned % size_page) != 0) {
+        size_aligned += (size_page - (size_aligned % size_page));
+    }
+
+    id<MTLDevice> device = ggml_backend_metal_get_device();
+
+    // the buffer fits into the max buffer size allowed by the device
+    if (size_aligned <= device.maxBufferLength) {
+        ctx->buffers[ctx->n_buffers].data = data;
+        ctx->buffers[ctx->n_buffers].size = size;
+
+        ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+        if (ctx->buffers[ctx->n_buffers].metal == nil) {
+            GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+            return false;
+        }
+
+        GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
+
+        ++ctx->n_buffers;
+    } else {
+        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
+        // one of the views
+        const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
+        const size_t size_step = device.maxBufferLength - size_ovlp;
+        const size_t size_view = device.maxBufferLength;
+
+        for (size_t i = 0; i < size; i += size_step) {
+            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
+
+            ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
+            ctx->buffers[ctx->n_buffers].size = size_step_aligned;
+
+            ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+            if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
+                return false;
+            }
+
+            GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i);
+            if (i + size_step < size) {
+                GGML_METAL_LOG_INFO("\n");
+            }
+
+            ++ctx->n_buffers;
+        }
+    }
+
+#if TARGET_OS_OSX
+    GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
+        device.currentAllocatedSize / 1024.0 / 1024.0,
+        device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+    if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
+        GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+    } else {
+        GGML_METAL_LOG_INFO("\n");
+    }
+#else
+    GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
+#endif
+
+    return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
+}
+
+// backend
+
 static const char * ggml_backend_metal_name(ggml_backend_t backend) {
     return "Metal";
 
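When the mapping is larger than device.maxBufferLength, the code above carves it into overlapping views: view starts advance by size_step while every view is size_view bytes long, so consecutive views overlap by size_ovlp bytes (max_size rounded up to a page, plus one more page), and any tensor of at most max_size bytes falls entirely inside some view. A small standalone C sketch of that layout; every constant below is an assumption chosen only for illustration, and the final-view length is slightly simplified:

```c
#include <stdio.h>
#include <stddef.h>

int main(void) {
    const size_t size_page = 16384;                       // assumed page size
    const size_t max_buf   = (size_t) 1024*1024*1024;     // assumed maxBufferLength: 1 GiB
    const size_t max_size  = (size_t)  200*1024*1024;     // assumed largest tensor: 200 MiB
    const size_t total     = (size_t) 3*1024*1024*1024;   // assumed mapping size: 3 GiB

    // same arithmetic as the diff: overlap of at least max_size, rounded up to pages plus one page
    const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page;
    const size_t size_step = max_buf - size_ovlp;   // distance between view start offsets
    const size_t size_view = max_buf;               // length of every non-final view

    for (size_t i = 0; i < total; i += size_step) {
        // final view is shortened (the diff uses the page-aligned total here)
        const size_t len = (i + size_view <= total) ? size_view : (total - i);
        printf("view: offset = %zu, length = %zu\n", i, len);
    }
    return 0;
}
```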
@@ -2499,10 +2659,6 @@ static void ggml_backend_metal_free(ggml_backend_t backend) {
     free(backend);
 }
 
-static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
-    UNUSED(backend);
-}
-
 static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
     return ggml_backend_metal_buffer_type();
 
@@ -2529,25 +2685,15 @@ static struct ggml_backend_i metal_backend_i = {
     /* .get_tensor_async      = */ NULL,
     /* .cpy_tensor_from_async = */ NULL,
     /* .cpy_tensor_to_async   = */ NULL,
-    /* .synchronize           = */ ggml_backend_metal_synchronize,
-    /* .graph_plan_create     = */ NULL,
+    /* .synchronize           = */ NULL,
+    /* .graph_plan_create     = */ NULL,
     /* .graph_plan_free       = */ NULL,
     /* .graph_plan_compute    = */ NULL,
     /* .graph_compute         = */ ggml_backend_metal_graph_compute,
     /* .supports_op           = */ ggml_backend_metal_supports_op,
 };
 
-// TODO: make a common log callback for all backends in ggml-backend
-static void ggml_backend_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
-    fprintf(stderr, "%s", msg);
-
-    UNUSED(level);
-    UNUSED(user_data);
-}
-
 ggml_backend_t ggml_backend_metal_init(void) {
-    ggml_metal_log_set_callback(ggml_backend_log_callback, NULL);
-
     struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
 
     if (ctx == NULL) {
data/ext/llama_cpp/src/ggml-metal.metal

@@ -1702,8 +1702,9 @@ kernel void kernel_rope(
                 dst_data[1] = x0*sin_theta + x1*cos_theta;
             }
         } else {
-            for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
-                const int64_t ib = 0;
+            for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) {
+                if (ic < n_dims) {
+                    const int64_t ib = 0;
 
                 // simplified from `(ib * n_dims + ic) * inv_ndims`
                 const float cur_rot = inv_ndims*ic - ib;
@@ -1722,6 +1723,14 @@ kernel void kernel_rope(
 
                 dst_data[0]        = x0*cos_theta - x1*sin_theta;
                 dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+                } else {
+                    const int64_t i0 = ic;
+
+                    device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                    device       T * dst_data  = (device T *)((device char *) dst  + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                    dst_data[0] = src[0];
+                    dst_data[1] = src[1];
             }
         }
     }