llama_cpp 0.10.1 → 0.10.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +16 -1
- data/ext/llama_cpp/src/ggml-alloc.c +12 -4
- data/ext/llama_cpp/src/ggml-backend-impl.h +12 -8
- data/ext/llama_cpp/src/ggml-backend.c +75 -5
- data/ext/llama_cpp/src/ggml-backend.h +7 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +284 -162
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +190 -44
- data/ext/llama_cpp/src/ggml-metal.metal +11 -2
- data/ext/llama_cpp/src/ggml.c +262 -89
- data/ext/llama_cpp/src/ggml.h +24 -10
- data/ext/llama_cpp/src/llama.cpp +926 -780
- data/ext/llama_cpp/src/llama.h +8 -3
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/ggml-metal.h:

@@ -98,7 +98,10 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
 
 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
+GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+
 GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
+
 GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
 
 // helper to check if the device supports a specific family
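The new `ggml_backend_metal_buffer_from_ptr` entry point wraps memory the caller already owns (typically an mmap'ed model file) in a Metal backend buffer without copying it. A minimal usage sketch, assuming the region stays valid for the buffer's lifetime; the helper name `wrap_host_region` is illustrative and not part of the diff:

```c
#include "ggml-backend.h"
#include "ggml-metal.h"

// Hypothetical helper: expose an existing host allocation to the Metal backend.
// `data`/`size` would typically describe an mmap'ed model file; `max_tensor_size`
// is the largest tensor that will live in the region, so the backend can size the
// overlapping views it creates when the region exceeds the device's max buffer length.
static ggml_backend_buffer_t wrap_host_region(void * data, size_t size, size_t max_tensor_size) {
    // returns a null buffer if the underlying MTLBuffer views could not be created
    return ggml_backend_metal_buffer_from_ptr(data, size, max_tensor_size);
}
```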
data/ext/llama_cpp/src/ggml-metal.m:

@@ -180,7 +180,15 @@ struct ggml_metal_context {
 @implementation GGMLMetalClass
 @end
 
-
+
+static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+    fprintf(stderr, "%s", msg);
+
+    UNUSED(level);
+    UNUSED(user_data);
+}
+
+ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
 void * ggml_metal_log_user_data = NULL;
 
 void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
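This hunk gives the Metal backend a default log callback that writes to stderr. Callers can still install their own sink through the existing `ggml_metal_log_set_callback`, declared just below in the same file. A small sketch; interpreting `user_data` as a `FILE *` is a choice made for this example, not something the diff prescribes:

```c
#include <stdio.h>
#include "ggml.h"
#include "ggml-metal.h"

// Example callback: forward Metal backend log messages to a caller-chosen FILE *.
static void my_metal_logger(enum ggml_log_level level, const char * msg, void * user_data) {
    FILE * sink = (FILE *) user_data;   // interpretation of user_data is up to the caller
    fprintf(sink, "[ggml-metal %d] %s", (int) level, msg);
}

// During setup, before creating the backend:
//     ggml_metal_log_set_callback(my_metal_logger, stderr);
```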
@@ -607,12 +615,24 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
 }
 
 // temporarily defined here for compatibility between ggml-backend and the old API
-
-
+
+struct ggml_backend_metal_buffer {
+    void * data;
+    size_t size;
 
     id<MTLBuffer> metal;
 };
 
+struct ggml_backend_metal_buffer_context {
+    void * all_data;
+    size_t all_size;
+    bool   owned;
+
+    // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
+    int n_buffers;
+    struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
+};
+
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
@@ -622,17 +642,29 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
 
     const int64_t tsize = ggml_nbytes(t);
 
+    ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
+
     // compatibility with ggml-backend
-    if (
-    struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *)
+    if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
+        struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
+
+        // find the view that contains the tensor fully
+        for (int i = 0; i < buf_ctx->n_buffers; ++i) {
+            const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
 
-
+            //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
+            if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
+                *offs = (size_t) ioffs;
 
-
+                //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
+
+                return buf_ctx->buffers[i].metal;
+            }
+        }
 
-
+        GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
 
-        return
+        return nil;
     }
 
     // find the view that contains the tensor fully
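The lookup now resolves the tensor's backing buffer (following `view_src` for tensor views) and, when it is a Metal backend buffer, walks the views of its `ggml_backend_metal_buffer_context` until it finds one that contains the tensor's byte range in full. A standalone restatement of that containment test in plain C, with illustrative names:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

// True if the byte range [ptr, ptr + nbytes) lies entirely inside the view that
// starts at view_base and spans view_size bytes; this is the same check as the new
// `ioffs >= 0 && ioffs + tsize <= size` condition in ggml_metal_get_buffer.
static bool view_contains(const void * view_base, size_t view_size,
                          const void * ptr, int64_t nbytes) {
    const int64_t ioffs = (int64_t) (uintptr_t) ptr - (int64_t) (uintptr_t) view_base;
    return ioffs >= 0 && ioffs + nbytes <= (int64_t) view_size;
}
```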
@@ -1261,7 +1293,7 @@ void ggml_metal_graph_compute(
     {
         GGML_ASSERT(ggml_is_contiguous(src0));
 
-        const float scale = *(const float *)
+        const float scale = *(const float *) dst->op_params;
 
         int64_t n = ggml_nelements(dst);
 
@@ -1272,8 +1304,8 @@ void ggml_metal_graph_compute(
         [encoder setComputePipelineState:ctx->pipeline_scale];
     }
 
-    [encoder setBuffer:id_src0
-    [encoder setBuffer:id_dst
+    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
     [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
 
     [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
@@ -2361,6 +2393,7 @@ void ggml_metal_graph_compute(
 
 // backend interface
 
+// default buffer
 static id<MTLDevice> g_backend_device = nil;
 static int g_backend_device_ref_count = 0;
 
@@ -2388,34 +2421,31 @@ static void ggml_backend_metal_free_device(void) {
 static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
-    return ctx->
+    return ctx->all_data;
 }
 
 static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
-
+    for (int i = 0; i < ctx->n_buffers; i++) {
+        [ctx->buffers[i].metal release];
+    }
     ggml_backend_metal_free_device();
 
-
-
+    if (ctx->owned) {
+        free(ctx->all_data);
+    }
 
-
+    free(ctx);
 }
 
 static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy((char *)tensor->data + offset, data, size);
 
     UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy(data, (const char *)tensor->data + offset, size);
 
     UNUSED(buffer);
@@ -2433,7 +2463,13 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer
     UNUSED(buffer);
 }
 
-static
+static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    memset(ctx->all_data, value, ctx->all_size);
+}
+
+static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
     /* .free_buffer     = */ ggml_backend_metal_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_metal_buffer_get_base,
     /* .init_tensor     = */ NULL,
@@ -2441,8 +2477,11 @@ static struct ggml_backend_buffer_i metal_backend_buffer_i = {
     /* .get_tensor      = */ ggml_backend_metal_buffer_get_tensor,
     /* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from,
     /* .cpy_tensor_to   = */ ggml_backend_metal_buffer_cpy_tensor_to,
+    /* .clear           = */ ggml_backend_metal_buffer_clear,
 };
 
+// default buffer type
+
 static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
 
@@ -2453,13 +2492,46 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
-
-
+    id<MTLDevice> device = ggml_backend_metal_get_device();
+
+    ctx->all_data = ggml_metal_host_malloc(size_aligned);
+    ctx->all_size = size_aligned;
+    ctx->owned = true;
+    ctx->n_buffers = 1;
+
+    ctx->buffers[0].data = ctx->all_data;
+    ctx->buffers[0].size = size;
+    ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
                     length:size_aligned
                     options:MTLResourceStorageModeShared
                     deallocator:nil];
 
-
+    if (ctx->buffers[0].metal == nil) {
+        GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+        free(ctx);
+        ggml_backend_metal_free_device();
+        return NULL;
+    }
+
+    GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
+
+
+#if TARGET_OS_OSX
+    GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
+        device.currentAllocatedSize / 1024.0 / 1024.0,
+        device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+    if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
+        GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+    } else {
+        GGML_METAL_LOG_INFO("\n");
+    }
+#else
+    GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
+#endif
+
+
+    return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
 }
 
 static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
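`ggml_backend_metal_buffer_type_alloc_buffer` now fills in the multi-view context with a single owned view backed by `ggml_metal_host_malloc`, and the requested size is rounded up to whole pages before the shared-storage MTLBuffer is created. A tiny sketch of that rounding; for example, a 1,024,000-byte request on a 16 KiB page system becomes 1,032,192 bytes (63 pages):

```c
#include <stddef.h>

// Round size up to a multiple of the page size (page > 0), as the allocator above
// does before calling newBufferWithBytesNoCopy with shared storage mode.
static size_t align_up_to_page(size_t size, size_t page) {
    const size_t rem = size % page;
    return rem == 0 ? size : size + (page - rem);
}
```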
@@ -2470,7 +2542,13 @@ static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_t
 static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
 
-
+    UNUSED(buft);
+}
+
+static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    UNUSED(buft);
 }
 
 ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
@@ -2480,6 +2558,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
         /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
         /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
         /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
+        /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
     },
     /* .context = */ NULL,
 };
@@ -2487,6 +2566,87 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
     return &ggml_backend_buffer_type_metal;
 }
 
+// buffer from ptr
+
+ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
+    struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
+
+    ctx->all_data = data;
+    ctx->all_size = size;
+    ctx->owned = false;
+    ctx->n_buffers = 0;
+
+    const size_t size_page = sysconf(_SC_PAGESIZE);
+    size_t size_aligned = size;
+    if ((size_aligned % size_page) != 0) {
+        size_aligned += (size_page - (size_aligned % size_page));
+    }
+
+    id<MTLDevice> device = ggml_backend_metal_get_device();
+
+    // the buffer fits into the max buffer size allowed by the device
+    if (size_aligned <= device.maxBufferLength) {
+        ctx->buffers[ctx->n_buffers].data = data;
+        ctx->buffers[ctx->n_buffers].size = size;
+
+        ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+        if (ctx->buffers[ctx->n_buffers].metal == nil) {
+            GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+            return false;
+        }
+
+        GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
+
+        ++ctx->n_buffers;
+    } else {
+        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
+        // one of the views
+        const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
+        const size_t size_step = device.maxBufferLength - size_ovlp;
+        const size_t size_view = device.maxBufferLength;
+
+        for (size_t i = 0; i < size; i += size_step) {
+            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
+
+            ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
+            ctx->buffers[ctx->n_buffers].size = size_step_aligned;
+
+            ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+            if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
+                return false;
+            }
+
+            GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i);
+            if (i + size_step < size) {
+                GGML_METAL_LOG_INFO("\n");
+            }
+
+            ++ctx->n_buffers;
+        }
+    }
+
+#if TARGET_OS_OSX
+    GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
+        device.currentAllocatedSize / 1024.0 / 1024.0,
+        device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+    if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
+        GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+    } else {
+        GGML_METAL_LOG_INFO("\n");
+    }
+#else
+    GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
+#endif
+
+    return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
+}
+
+// backend
+
 static const char * ggml_backend_metal_name(ggml_backend_t backend) {
     return "Metal";
 
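In `ggml_backend_metal_buffer_from_ptr`, a region larger than `device.maxBufferLength` is covered by overlapping views: each view spans up to `maxBufferLength` bytes, consecutive views start `size_step = maxBufferLength - size_ovlp` bytes apart, and `size_ovlp` is `max_size` rounded up by about two pages, so any tensor of at most `max_size` bytes falls entirely within at least one view. A small sketch of the same stepping; the function name and the example numbers are illustrative:

```c
#include <stddef.h>

// Count the overlapping views the split above would create for a region of `size`
// bytes. Mirrors the loop `for (size_t i = 0; i < size; i += size_step)`; assumes
// max_tensor_size is much smaller than max_buffer_len, so size_step stays positive.
static int count_views(size_t size, size_t max_buffer_len, size_t max_tensor_size, size_t page) {
    if (size <= max_buffer_len) {
        return 1;
    }
    const size_t size_ovlp = ((max_tensor_size + page - 1) / page + 1) * page; // ~2 pages of slack
    const size_t size_step = max_buffer_len - size_ovlp;

    int n = 0;
    for (size_t i = 0; i < size; i += size_step) {
        ++n;
    }
    return n;
}
```

For example, a 32 GiB region with a 16 GiB `maxBufferLength` and a 512 MiB largest tensor gives a `size_step` of roughly 15.5 GiB, so three views cover the region.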
@@ -2499,10 +2659,6 @@ static void ggml_backend_metal_free(ggml_backend_t backend) {
     free(backend);
 }
 
-static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
-    UNUSED(backend);
-}
-
 static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
     return ggml_backend_metal_buffer_type();
 
@@ -2529,25 +2685,15 @@ static struct ggml_backend_i metal_backend_i = {
     /* .get_tensor_async      = */ NULL,
     /* .cpy_tensor_from_async = */ NULL,
     /* .cpy_tensor_to_async   = */ NULL,
-    /* .synchronize           = */
-    /* .graph_plan_create     = */ NULL,
+    /* .synchronize           = */ NULL,
+    /* .graph_plan_create     = */ NULL,
     /* .graph_plan_free       = */ NULL,
     /* .graph_plan_compute    = */ NULL,
     /* .graph_compute         = */ ggml_backend_metal_graph_compute,
     /* .supports_op           = */ ggml_backend_metal_supports_op,
 };
 
-// TODO: make a common log callback for all backends in ggml-backend
-static void ggml_backend_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
-    fprintf(stderr, "%s", msg);
-
-    UNUSED(level);
-    UNUSED(user_data);
-}
-
 ggml_backend_t ggml_backend_metal_init(void) {
-    ggml_metal_log_set_callback(ggml_backend_log_callback, NULL);
-
     struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
 
     if (ctx == NULL) {
data/ext/llama_cpp/src/ggml-metal.metal:

@@ -1702,8 +1702,9 @@ kernel void kernel_rope(
             dst_data[1] = x0*sin_theta + x1*cos_theta;
         }
     } else {
-        for (int64_t
-
+        for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) {
+            if (ic < n_dims) {
+                const int64_t ib = 0;
 
                 // simplified from `(ib * n_dims + ic) * inv_ndims`
                 const float cur_rot = inv_ndims*ic - ib;
@@ -1722,6 +1723,14 @@ kernel void kernel_rope(
 
                 dst_data[0]        = x0*cos_theta - x1*sin_theta;
                 dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+            } else {
+                const int64_t i0 = ic;
+
+                device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                dst_data[0] = src[0];
+                dst_data[1] = src[1];
             }
         }
     }