llama_cpp 0.9.1 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +383 -210
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +277 -53
- data/ext/llama_cpp/src/ggml-cuda.h +5 -0
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +112 -30
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +173 -73
- data/ext/llama_cpp/src/ggml.c +877 -1707
- data/ext/llama_cpp/src/ggml.h +68 -45
- data/ext/llama_cpp/src/llama.cpp +475 -117
- data/ext/llama_cpp/src/llama.h +11 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -91,6 +91,8 @@
 #define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif
 
+#define LLAMA_MAX_NODES 4096
+
 //
 // logging
 //
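The new `LLAMA_MAX_NODES` constant caps how many nodes a compute graph may hold; later hunks in this diff allocate graphs with `ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false)` and size the compute buffer from it. A minimal sketch of that pairing (illustrative only; the ggml calls are the ones used elsewhere in this diff, and the context sizing here is a deliberately generous placeholder):

```cpp
#include "ggml.h"

#define LLAMA_MAX_NODES 4096

// Sketch: build a fixed-capacity graph. llama.cpp sizes its compute buffer as
// ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead(); here we simply
// reserve a generous metadata-only context for illustration.
static struct ggml_cgraph * new_llama_graph(void) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 32u*1024u*1024u, // generous; metadata only
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true,
    };
    struct ggml_context * ctx = ggml_init(params); // caller would ggml_free(ctx) when done
    return ggml_new_graph_custom(ctx, LLAMA_MAX_NODES, /* grads = */ false);
}
```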
@@ -190,6 +192,7 @@ enum llm_arch {
     LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
     LLM_ARCH_BLOOM,
+    LLM_ARCH_STABLELM,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -205,6 +208,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_PERSIMMON, "persimmon" },
     { LLM_ARCH_REFACT, "refact" },
     { LLM_ARCH_BLOOM, "bloom" },
+    { LLM_ARCH_STABLELM, "stablelm" },
 };
 
 enum llm_kv {
@@ -251,6 +255,8 @@ enum llm_kv {
     LLM_KV_TOKENIZER_UNK_ID,
     LLM_KV_TOKENIZER_SEP_ID,
     LLM_KV_TOKENIZER_PAD_ID,
+    LLM_KV_TOKENIZER_ADD_BOS,
+    LLM_KV_TOKENIZER_ADD_EOS,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
 };
@@ -299,6 +305,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
     { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
     { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+    { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
+    { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
     { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
     { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
 };
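The two new vocabulary keys, `tokenizer.ggml.add_bos_token` and `tokenizer.ggml.add_eos_token`, are read later in this diff as optional GGUF booleans. A hedged sketch of how a GGUF producer might set them with ggml's gguf API (the function names follow ggml.h of this vintage; the surrounding context setup is omitted and illustrative):

```cpp
#include "ggml.h"

// Sketch: record the tokenizer's BOS/EOS policy in GGUF metadata so that
// llama.cpp can pick it up via llama_add_bos_token()/llama_add_eos_token().
void set_token_policy(struct gguf_context * gctx) {
    gguf_set_val_bool(gctx, "tokenizer.ggml.add_bos_token", true);  // do prepend BOS
    gguf_set_val_bool(gctx, "tokenizer.ggml.add_eos_token", false); // do not append EOS
}
```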
@@ -493,6 +501,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
         { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_STABLELM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -596,19 +623,37 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // llama helpers
 //
 
+inline void * llama_host_malloc(size_t n) {
 #ifdef GGML_USE_CUBLAS
-
-
+    if (ggml_cublas_loaded()) {
+        return ggml_cuda_host_malloc(n);
+    } else {
+        return malloc(n);
+    }
+#elif GGML_USE_METAL
+    return ggml_metal_host_malloc(n);
+#elif GGML_USE_CPU_HBM
+    return hbw_malloc(n);
+#else
+    return malloc(n);
+#endif
+}
+
+inline void llama_host_free(void * ptr) {
+#ifdef GGML_USE_CUBLAS
+    if (ggml_cublas_loaded()) {
+        return ggml_cuda_host_free(ptr);
+    } else {
+        return free(ptr);
+    }
 #elif GGML_USE_METAL
-
-# define llama_host_free(data) ggml_metal_host_free(data)
+    return ggml_metal_host_free(ptr);
 #elif GGML_USE_CPU_HBM
-
-# define llama_host_free(data) if (data != NULL) hbw_free(data)
+    return hbw_free(ptr);
 #else
-
-# define llama_host_free(data) free(data)
+    return free(ptr);
 #endif
+}
 
 #if defined(_WIN32)
 static std::string llama_format_win_err(DWORD err) {
@@ -1037,9 +1082,9 @@ enum e_model {
     MODEL_70B,
 };
 
-static const size_t
-static const size_t
-static const size_t
+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;
 
 struct llama_hparams {
     bool vocab_only;
@@ -1195,9 +1240,11 @@ struct llama_kv_cache {
     }
 
 #ifdef GGML_USE_CUBLAS
-
-
-
+        if (ggml_cublas_loaded()) {
+            ggml_cuda_free_data(k);
+            ggml_cuda_free_data(v);
+        }
+#endif
     }
 };
 
@@ -1228,6 +1275,9 @@ struct llama_vocab {
     id special_sep_id = -1;
     id special_pad_id = -1;
 
+    int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
+    int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
+
     id linefeed_id = 13;
     id special_prefix_id = 32007;
     id special_middle_id = 32009;
@@ -1297,11 +1347,15 @@ struct llama_model {
     }
 
 #ifdef GGML_USE_CUBLAS
-
-
+        if (ggml_cublas_loaded()) {
+            for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+                ggml_cuda_free_data(tensors_by_name[i].second);
+            }
+            ggml_cuda_free_scratch();
         }
-
-
+#endif
+
+#if defined(GGML_USE_CLBLAST)
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cl_free_data(tensors_by_name[i].second);
         }
@@ -1413,23 +1467,26 @@ static bool llama_kv_cache_init(
     ggml_set_name(cache.v, "cache_v");
 
     (void) n_gpu_layers;
+
 #ifdef GGML_USE_CUBLAS
-
+    if (ggml_cublas_loaded()) {
+        size_t vram_kv_cache = 0;
 
-
-
-
-
-
-
-
-
-
-
-
+        if (n_gpu_layers > (int)n_layer + 1) {
+            ggml_cuda_assign_buffers_no_scratch(cache.v);
+            LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+            vram_kv_cache += ggml_nbytes(cache.v);
+        }
+        if (n_gpu_layers > (int)n_layer + 2) {
+            ggml_cuda_assign_buffers_no_scratch(cache.k);
+            LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+            vram_kv_cache += ggml_nbytes(cache.k);
+        }
+        if (vram_kv_cache > 0) {
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+        }
     }
-#endif
+#endif
 
     return true;
 }
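With the runtime check in place, the V cache is offloaded only when `n_gpu_layers > n_layer + 1` and the K cache only when `n_gpu_layers > n_layer + 2`. A tiny worked illustration under assumed numbers (the layer counts are hypothetical, not from the package):

```cpp
#include <cstdio>

int main() {
    const int n_layer      = 32; // assumed model depth
    const int n_gpu_layers = 35; // assumed user setting

    const bool offload_v = n_gpu_layers > n_layer + 1; // 35 > 33 -> true
    const bool offload_k = n_gpu_layers > n_layer + 2; // 35 > 34 -> true

    std::printf("offload v cache: %d, offload k cache: %d\n", offload_v, offload_k);
    return 0;
}
```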
@@ -2182,6 +2239,16 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_STABLELM:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+
         default: (void)0;
     }
 
@@ -2323,6 +2390,23 @@ static void llm_load_vocab(
                 __func__, key.c_str(), id, old_id);
             id = old_id;
         }
+
+        }
+
+        // Handle add_bos_token and add_eos_token
+        std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
+        int kid = gguf_find_key(ctx, key.c_str());
+        enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+        vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+        if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+            LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
+        }
+        key = kv(LLM_KV_TOKENIZER_ADD_EOS);
+        kid = gguf_find_key(ctx, key.c_str());
+        ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+        vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+        if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+            LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
         }
     }
 
@@ -2454,8 +2538,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
     LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
-    if (ml.n_bytes <
-    LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0,
+    if (ml.n_bytes < GiB) {
+        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
         LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     }
@@ -2493,7 +2577,7 @@ static void llm_load_tensors(
 
     ml.calc_sizes(ctx_size, mmapped_size);
 
-    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f
+    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
 
     // create the ggml context
     {
@@ -2516,18 +2600,22 @@ static void llm_load_tensors(
     }
 
     (void) main_gpu;
+
+    enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
+    enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
+
 #ifdef GGML_USE_CUBLAS
-
-
-
-
+    if (ggml_cublas_loaded()) {
+        LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
+        ggml_cuda_set_main_device(main_gpu);
+
+        llama_backend_offload = GGML_BACKEND_GPU;
+        llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
+    }
 #elif defined(GGML_USE_CLBLAST)
-
-
-
-#else
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
+    LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
+    llama_backend_offload = GGML_BACKEND_GPU;
+    llama_backend_offload_split = GGML_BACKEND_GPU;
 #endif
 
     // prepare memory for the weights
@@ -2554,12 +2642,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2583,8 +2671,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2620,12 +2708,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2649,8 +2737,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
        for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2690,12 +2778,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2721,8 +2809,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2767,12 +2855,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2798,8 +2886,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2841,15 +2929,22 @@ static void llm_load_tensors(
         ggml_backend_type backend_output;
 
         if (n_gpu_layers > int(n_layer)) {
+#ifdef GGML_USE_CUBLAS
+            if (n_gpu_layers > int(n_layer + 1)) {
+                LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
+                    __func__, n_layer + 1);
+                throw std::runtime_error("Persimmon CUDA offload failed");
+            }
+#endif
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2872,8 +2967,8 @@ static void llm_load_tensors(
         const int i_gpu_start = n_layer - n_gpu_layers;
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split;
             auto & layer = model.layers[i];
             layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
             layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
@@ -2910,12 +3005,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2941,8 +3036,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2988,12 +3083,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -3017,8 +3112,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -3042,6 +3137,81 @@ static void llm_load_tensors(
                 }
             }
         } break;
+        case LLM_ARCH_STABLELM:
+            {
+                model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = llama_backend_offload;
+#else
+                        backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
+#endif // _WIN32
+
+                        backend_output = llama_backend_offload_split;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    /*
+                    llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
+                    */
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                    layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                    layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+                    layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                            ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+                            ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+                    }
+                }
+            } break;
+
         default:
             throw std::runtime_error("unknown architecture");
     }
@@ -3056,7 +3226,7 @@ static void llm_load_tensors(
         ctx_size +
         mmapped_size - vram_weights; // weights in VRAM not in memory
 
-    LLAMA_LOG_INFO("%s: mem required = %7.2f
+    LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3075,7 +3245,7 @@ static void llm_load_tensors(
 #endif // GGML_USE_CUBLAS
 
     LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-    LLAMA_LOG_INFO("%s: VRAM used: %.2f
+    LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
 #else
     (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -3575,7 +3745,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_llama() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -3687,7 +3857,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_baichuan() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -3807,7 +3977,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_falcon() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -3929,7 +4099,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_starcoder() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -4028,7 +4198,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_persimmon() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_rot = n_embd_head / 2;
 
@@ -4173,7 +4343,7 @@ struct llm_build_context {
         struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
         cb(Kcur, "Kcur", il);
 
-        struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur,
+        struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
         cb(Q, "Q", il);
 
         Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
@@ -4238,7 +4408,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_refact() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4329,7 +4499,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_bloom() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4423,7 +4593,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_mpt() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4520,6 +4690,177 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_stablelm() {
+        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_scale
+        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+        cb(KQ_scale, "KQ_scale", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, hparams.n_rot, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(tmpq, "tmpq", il);
+
+                struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(tmpk, "tmpk", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                // RoPE the first n_rot of q/k, pass the other half, and concat.
+                struct ggml_tensor * qrot = ggml_cont(ctx0, ggml_view_3d(
+                        ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
+                        ggml_element_size(tmpq) * n_embd_head,
+                        ggml_element_size(tmpq) * n_embd_head * n_head,
+                        0
+                        ));
+                cb(qrot, "qrot", il);
+
+                struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d(
+                        ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
+                        ggml_element_size(tmpk) * n_embd_head,
+                        ggml_element_size(tmpk) * n_embd_head * n_head_kv,
+                        0
+                        ));
+                cb(krot, "krot", il);
+
+                // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
+                struct ggml_tensor * qpass = ggml_view_3d(
+                        ctx0, tmpq, (n_embd_head - hparams.n_rot), n_head, n_tokens,
+                        ggml_element_size(tmpq) * n_embd_head,
+                        ggml_element_size(tmpq) * n_embd_head * n_head,
+                        ggml_element_size(tmpq) * hparams.n_rot
+                        );
+                cb(qpass, "qpass", il);
+
+                struct ggml_tensor * kpass = ggml_view_3d(
+                        ctx0, tmpk, (n_embd_head - hparams.n_rot), n_head_kv, n_tokens,
+                        ggml_element_size(tmpk) * (n_embd_head),
+                        ggml_element_size(tmpk) * (n_embd_head) * n_head_kv,
+                        ggml_element_size(tmpk) * hparams.n_rot
+                        );
+                cb(kpass, "kpass", il);
+
+                struct ggml_tensor * qrotated = ggml_rope_custom(
+                        ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
+                        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+                cb(qrotated, "qrotated", il);
+
+                struct ggml_tensor * krotated = ggml_rope_custom(
+                        ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
+                        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+                cb(krotated, "krotated", il);
+
+                // ggml currently only supports concatenation on dim=2
+                // so we need to permute qrot, qpass, concat, then permute back.
+                qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
+                cb(qrotated, "qrotated", il);
+
+                krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
+                cb(krotated, "krotated", il);
+
+                qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
+                cb(qpass, "qpass", il);
+
+                kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
+                cb(kpass, "kpass", il);
+
+                struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
+                cb(Q, "Q", il);
+
+                Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
+                cb(Kcur, "Kcur", il);
+
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                        model.layers[il].wo, NULL,
+                        Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 //
@@ -4989,6 +5330,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_mpt();
             } break;
+        case LLM_ARCH_STABLELM:
+            {
+                result = llm.build_stablelm();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -5159,11 +5504,13 @@ static int llama_decode_internal(
 
     // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
     const bool full_offload_supported =
-        model.arch == LLM_ARCH_LLAMA
-        model.arch == LLM_ARCH_BAICHUAN
-        model.arch == LLM_ARCH_FALCON
-        model.arch == LLM_ARCH_REFACT
-        model.arch == LLM_ARCH_MPT
+        model.arch == LLM_ARCH_LLAMA ||
+        model.arch == LLM_ARCH_BAICHUAN ||
+        model.arch == LLM_ARCH_FALCON ||
+        model.arch == LLM_ARCH_REFACT ||
+        model.arch == LLM_ARCH_MPT ||
+        model.arch == LLM_ARCH_STARCODER ||
+        model.arch == LLM_ARCH_STABLELM;
 
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
     if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
@@ -5955,7 +6302,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
                 // and passing 'add space prefix' as bool argument
                 //
-                auto raw_text =
+                auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                if (&fragment == &fragment_buffer.front()) {
+                    raw_text = " " + raw_text; // prefix with space if the first token is not special
+                }
 
 #ifdef PRETOKENIZERDEBUG
                 fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
@@ -7607,7 +7957,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             workers.clear();
         }
 
-        LLAMA_LOG_INFO("size = %8.2f
+        LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
         int64_t tot_count = 0;
         for (size_t i = 0; i < hist_cur.size(); i++) {
             hist_all[i] += hist_cur[i];
@@ -7977,7 +8327,7 @@ struct llama_context_params llama_context_default_params() {
         /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
         /*.rope_freq_base =*/ 0.0f,
        /*.rope_freq_scale =*/ 0.0f,
-        /*.yarn_ext_factor =*/
+        /*.yarn_ext_factor =*/ -1.0f,
         /*.yarn_attn_factor =*/ 1.0f,
         /*.yarn_beta_fast =*/ 32.0f,
         /*.yarn_beta_slow =*/ 1.0f,
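The default `yarn_ext_factor` becomes `-1.0f`, a "not set" sentinel; the hunk that follows resolves it at context creation (1.0f when YaRN RoPE scaling is selected, 0.0f otherwise). A minimal caller-side sketch of the public API (the override value is illustrative):

```cpp
#include "llama.h"

int main() {
    llama_context_params params = llama_context_default_params();

    // params.yarn_ext_factor now starts at -1.0f ("not set"); leaving it
    // negative lets llama_new_context_with_model pick 1.0f for YaRN scaling
    // or 0.0f otherwise. Any non-negative value is kept as an explicit override.
    params.yarn_ext_factor = 0.5f; // illustrative override
    return 0;
}
```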
@@ -8120,7 +8470,7 @@ struct llama_context * llama_new_context_with_model(
         cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
     }
 
-    if (
+    if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
         cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
     }
 
@@ -8147,7 +8497,7 @@ struct llama_context * llama_new_context_with_model(
 
     {
         const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
-        LLAMA_LOG_INFO("%s: kv self size = %7.2f
+        LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
     }
 
     // resized during inference
@@ -8164,7 +8514,7 @@ struct llama_context * llama_new_context_with_model(
     {
         static const size_t tensor_alignment = 32;
         // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
-        ctx->buf_compute.resize(ggml_tensor_overhead()*
+        ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
 
         // create measure allocator
         ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
@@ -8192,7 +8542,7 @@ struct llama_context * llama_new_context_with_model(
         // measure memory requirements for the graph
         size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
-        LLAMA_LOG_INFO("%s: compute buffer total size = %.2f
+        LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
 
         // recreate allocator with exact memory requirements
         ggml_allocr_free(ctx->alloc);
@@ -8206,7 +8556,7 @@ struct llama_context * llama_new_context_with_model(
 #endif
 #ifdef GGML_USE_CUBLAS
         ggml_cuda_set_scratch_size(alloc_size);
-        LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f
+        LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
 
         // calculate total VRAM usage
         auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8226,10 +8576,10 @@ struct llama_context * llama_new_context_with_model(
         size_t ctx_vram_size = alloc_size + kv_vram_size;
         size_t total_vram_size = model_vram_size + ctx_vram_size;
 
-        LLAMA_LOG_INFO("%s: total VRAM used: %.2f
+        LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
             total_vram_size / 1024.0 / 1024.0,
             model_vram_size / 1024.0 / 1024.0,
-            ctx_vram_size
+            ctx_vram_size / 1024.0 / 1024.0);
 #endif
     }
 
@@ -8250,7 +8600,7 @@ struct llama_context * llama_new_context_with_model(
 
         const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-        LLAMA_LOG_INFO("%s: max tensor size = %8.2f
+        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);
 
 #define LLAMA_METAL_CHECK_BUF(result) \
     if (!(result)) { \
@@ -8553,8 +8903,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     if (kv_buf_size) {
         const size_t elt_size = ggml_element_size(kv_self.k);
 
-        ggml_context * cpy_ctx = ggml_init({
-        ggml_cgraph gf
+        ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+        ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
         ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
         std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
@@ -8572,9 +8922,9 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
             kv_head, n_embd, n_layer,
             elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
 
-        ggml_build_forward_expand(
-        ggml_build_forward_expand(
-        ggml_graph_compute_helper(ctx->work_buffer,
+        ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
+        ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
+        ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
         ggml_free(cpy_ctx);
 
@@ -8681,8 +9031,8 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         const size_t elt_size = ggml_element_size(kv_self.k);
 
-        ggml_context * cpy_ctx = ggml_init({
-        ggml_cgraph gf
+        ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+        ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
         ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
         kin3d->data = (void *) inp;
@@ -8700,9 +9050,9 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
             kv_head, n_embd, n_layer,
             elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
 
-        ggml_build_forward_expand(
-        ggml_build_forward_expand(
-        ggml_graph_compute_helper(ctx->work_buffer,
+        ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
+        ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
+        ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
         ggml_free(cpy_ctx);
     }
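The two state-data hunks only change how the temporary copy graph is allocated; the public save/restore entry points are unchanged. For reference, a hedged round-trip sketch using the existing llama.h API (error handling omitted; `ctx` is assumed to be a valid context):

```cpp
#include "llama.h"
#include <vector>

// Save the context state (RNG, logits, embedding, KV cache) into a buffer
// and restore it later, e.g. to roll back to an earlier decode position.
std::vector<uint8_t> save_state(llama_context * ctx) {
    std::vector<uint8_t> buf(llama_get_state_size(ctx));
    llama_copy_state_data(ctx, buf.data());
    return buf;
}

void restore_state(llama_context * ctx, std::vector<uint8_t> & buf) {
    llama_set_state_data(ctx, buf.data());
}
```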
@@ -8957,6 +9307,14 @@ llama_token llama_token_nl(const struct llama_model * model) {
     return model->vocab.linefeed_id;
 }
 
+int llama_add_bos_token(const struct llama_model * model) {
+    return model->vocab.special_add_bos;
+}
+
+int llama_add_eos_token(const struct llama_model * model) {
+    return model->vocab.special_add_eos;
+}
+
 llama_token llama_token_prefix(const struct llama_model * model) {
     return model->vocab.special_prefix_id;
 }
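The two new exported functions surface the GGUF flags read earlier in this diff and return 1 (add), 0 (don't add) or -1 (unknown). A minimal caller-side sketch with an explicit fallback for the unknown case (the fallback policy is illustrative):

```cpp
#include "llama.h"

// Decide whether to prepend BOS for this model, falling back to a
// caller-chosen default when the metadata does not say (-1).
bool should_add_bos(const llama_model * model, bool fallback) {
    const int flag = llama_add_bos_token(model); // 1 add, 0 don't, -1 unknown
    return flag < 0 ? fallback : flag != 0;
}
```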