llama_cpp 0.9.1 → 0.9.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +383 -210
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +277 -53
- data/ext/llama_cpp/src/ggml-cuda.h +5 -0
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +112 -30
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +173 -73
- data/ext/llama_cpp/src/ggml.c +877 -1707
- data/ext/llama_cpp/src/ggml.h +68 -45
- data/ext/llama_cpp/src/llama.cpp +475 -117
- data/ext/llama_cpp/src/llama.h +11 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -91,6 +91,8 @@
 #define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif

+#define LLAMA_MAX_NODES 4096
+
 //
 // logging
 //
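Note: LLAMA_MAX_NODES caps the number of nodes in a compute graph. Later hunks in this diff consume the same constant when allocating graphs (ggml_new_graph_custom) and when sizing the compute buffer. A minimal sketch of that pattern, assuming only a valid ggml_context obtained from ggml_init():

    // sketch of how the constant is consumed elsewhere in this diff
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, /*grads =*/ false);
    // metadata buffer sized for the worst-case graph
    const size_t buf_size = ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead();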
@@ -190,6 +192,7 @@ enum llm_arch {
     LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
     LLM_ARCH_BLOOM,
+    LLM_ARCH_STABLELM,
     LLM_ARCH_UNKNOWN,
 };

@@ -205,6 +208,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_PERSIMMON, "persimmon" },
     { LLM_ARCH_REFACT, "refact" },
     { LLM_ARCH_BLOOM, "bloom" },
+    { LLM_ARCH_STABLELM, "stablelm" },
 };

 enum llm_kv {
@@ -251,6 +255,8 @@ enum llm_kv {
     LLM_KV_TOKENIZER_UNK_ID,
     LLM_KV_TOKENIZER_SEP_ID,
     LLM_KV_TOKENIZER_PAD_ID,
+    LLM_KV_TOKENIZER_ADD_BOS,
+    LLM_KV_TOKENIZER_ADD_EOS,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
 };
@@ -299,6 +305,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
     { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
     { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+    { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
+    { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
     { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
     { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
 };
@@ -493,6 +501,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_STABLELM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -596,19 +623,37 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // llama helpers
 //

+inline void * llama_host_malloc(size_t n) {
 #ifdef GGML_USE_CUBLAS
-
-
+    if (ggml_cublas_loaded()) {
+        return ggml_cuda_host_malloc(n);
+    } else {
+        return malloc(n);
+    }
+#elif GGML_USE_METAL
+    return ggml_metal_host_malloc(n);
+#elif GGML_USE_CPU_HBM
+    return hbw_malloc(n);
+#else
+    return malloc(n);
+#endif
+}
+
+inline void llama_host_free(void * ptr) {
+#ifdef GGML_USE_CUBLAS
+    if (ggml_cublas_loaded()) {
+        return ggml_cuda_host_free(ptr);
+    } else {
+        return free(ptr);
+    }
 #elif GGML_USE_METAL
-
-# define llama_host_free(data) ggml_metal_host_free(data)
+    return ggml_metal_host_free(ptr);
 #elif GGML_USE_CPU_HBM
-
-# define llama_host_free(data) if (data != NULL) hbw_free(data)
+    return hbw_free(ptr);
 #else
-
-# define llama_host_free(data) free(data)
+    return free(ptr);
 #endif
+}

 #if defined(_WIN32)
 static std::string llama_format_win_err(DWORD err) {
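Note: the old llama_host_malloc/llama_host_free macros become inline functions so the CUDA build can fall back to plain malloc/free at runtime when ggml_cublas_loaded() reports that no device was initialized. A hedged caller-side sketch (the error handling is illustrative, not taken from llama.cpp):

    // allocate a host buffer: pinned memory when cuBLAS is loaded, plain malloc otherwise
    void * buf = llama_host_malloc(n_bytes);
    if (buf == NULL) {
        throw std::runtime_error("failed to allocate host buffer");
    }
    // ... fill and use the buffer ...
    llama_host_free(buf);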
@@ -1037,9 +1082,9 @@ enum e_model {
     MODEL_70B,
 };

-static const size_t
-static const size_t
-static const size_t
+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;

 struct llama_hparams {
     bool vocab_only;
@@ -1195,9 +1240,11 @@ struct llama_kv_cache {
         }

 #ifdef GGML_USE_CUBLAS
-
-
-
+        if (ggml_cublas_loaded()) {
+            ggml_cuda_free_data(k);
+            ggml_cuda_free_data(v);
+        }
+#endif
     }
 };

@@ -1228,6 +1275,9 @@ struct llama_vocab {
     id special_sep_id = -1;
     id special_pad_id = -1;

+    int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
+    int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
+
     id linefeed_id = 13;
     id special_prefix_id = 32007;
     id special_middle_id = 32009;
@@ -1297,11 +1347,15 @@ struct llama_model {
         }

 #ifdef GGML_USE_CUBLAS
-
-
+        if (ggml_cublas_loaded()) {
+            for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+                ggml_cuda_free_data(tensors_by_name[i].second);
+            }
+            ggml_cuda_free_scratch();
         }
-
-
+#endif
+
+#if defined(GGML_USE_CLBLAST)
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cl_free_data(tensors_by_name[i].second);
         }
@@ -1413,23 +1467,26 @@ static bool llama_kv_cache_init(
     ggml_set_name(cache.v, "cache_v");

     (void) n_gpu_layers;
+
 #ifdef GGML_USE_CUBLAS
-
+    if (ggml_cublas_loaded()) {
+        size_t vram_kv_cache = 0;

-
-
-
-
-
-
-
-
-
-
-
+        if (n_gpu_layers > (int)n_layer + 1) {
+            ggml_cuda_assign_buffers_no_scratch(cache.v);
+            LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+            vram_kv_cache += ggml_nbytes(cache.v);
+        }
+        if (n_gpu_layers > (int)n_layer + 2) {
+            ggml_cuda_assign_buffers_no_scratch(cache.k);
+            LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+            vram_kv_cache += ggml_nbytes(cache.k);
+        }
+        if (vram_kv_cache > 0) {
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+        }
     }
-#endif
+#endif

     return true;
 }
@@ -2182,6 +2239,16 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_STABLELM:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+
         default: (void)0;
     }

@@ -2323,6 +2390,23 @@ static void llm_load_vocab(
                     __func__, key.c_str(), id, old_id);
                 id = old_id;
             }
+
+        }
+
+        // Handle add_bos_token and add_eos_token
+        std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
+        int kid = gguf_find_key(ctx, key.c_str());
+        enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+        vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+        if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+            LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
+        }
+        key = kv(LLM_KV_TOKENIZER_ADD_EOS);
+        kid = gguf_find_key(ctx, key.c_str());
+        ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+        vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+        if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+            LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
         }
     }

@@ -2454,8 +2538,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
     LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
-    if (ml.n_bytes <
-    LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0,
+    if (ml.n_bytes < GiB) {
+        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
         LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     }
@@ -2493,7 +2577,7 @@ static void llm_load_tensors(

     ml.calc_sizes(ctx_size, mmapped_size);

-    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f
+    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);

     // create the ggml context
     {
@@ -2516,18 +2600,22 @@ static void llm_load_tensors(
     }

     (void) main_gpu;
+
+    enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
+    enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
+
 #ifdef GGML_USE_CUBLAS
-
-
-
-
+    if (ggml_cublas_loaded()) {
+        LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
+        ggml_cuda_set_main_device(main_gpu);
+
+        llama_backend_offload = GGML_BACKEND_GPU;
+        llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
+    }
 #elif defined(GGML_USE_CLBLAST)
-
-
-
-#else
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
+    LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
+    llama_backend_offload = GGML_BACKEND_GPU;
+    llama_backend_offload_split = GGML_BACKEND_GPU;
 #endif

     // prepare memory for the weights
@@ -2554,12 +2642,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32

-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2583,8 +2671,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);

         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT

             auto & layer = model.layers[i];

@@ -2620,12 +2708,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32

-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2649,8 +2737,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);

         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT

             auto & layer = model.layers[i];

@@ -2690,12 +2778,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32

-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2721,8 +2809,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);

         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT

             auto & layer = model.layers[i];

@@ -2767,12 +2855,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32

-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2798,8 +2886,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);

         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT

             auto & layer = model.layers[i];

@@ -2841,15 +2929,22 @@ static void llm_load_tensors(
         ggml_backend_type backend_output;

         if (n_gpu_layers > int(n_layer)) {
+#ifdef GGML_USE_CUBLAS
+            if (n_gpu_layers > int(n_layer + 1)) {
+                LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
+                    __func__, n_layer + 1);
+                throw std::runtime_error("Persimmon CUDA offload failed");
+            }
+#endif
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32

-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2872,8 +2967,8 @@ static void llm_load_tensors(
         const int i_gpu_start = n_layer - n_gpu_layers;
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split;
             auto & layer = model.layers[i];
             layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
             layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
@@ -2910,12 +3005,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32

-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2941,8 +3036,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);

         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT

             auto & layer = model.layers[i];

@@ -2988,12 +3083,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU :
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32

-            backend_output =
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -3017,8 +3112,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);

         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU :
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT

             auto & layer = model.layers[i];

@@ -3042,6 +3137,81 @@ static void llm_load_tensors(
                 }
             }
         } break;
+        case LLM_ARCH_STABLELM:
+            {
+                model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = llama_backend_offload;
+#else
+                        backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
+#endif // _WIN32
+
+                        backend_output = llama_backend_offload_split;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    /*
+                    llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
+                    */
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT

+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                    layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                    layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+                    layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                            ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+                            ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+                    }
+                }
+            } break;
+
         default:
             throw std::runtime_error("unknown architecture");
     }
@@ -3056,7 +3226,7 @@ static void llm_load_tensors(
             ctx_size +
             mmapped_size - vram_weights; // weights in VRAM not in memory

-        LLAMA_LOG_INFO("%s: mem required = %7.2f
+        LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);

 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3075,7 +3245,7 @@ static void llm_load_tensors(
 #endif // GGML_USE_CUBLAS

         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-        LLAMA_LOG_INFO("%s: VRAM used: %.2f
+        LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
 #else
         (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -3575,7 +3745,7 @@ struct llm_build_context {
     }

     struct ggml_cgraph * build_llama() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

         GGML_ASSERT(n_embd_head == hparams.n_rot);

@@ -3687,7 +3857,7 @@ struct llm_build_context {
     }

     struct ggml_cgraph * build_baichuan() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -3807,7 +3977,7 @@ struct llm_build_context {
     }

     struct ggml_cgraph * build_falcon() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -3929,7 +4099,7 @@ struct llm_build_context {
     }

     struct ggml_cgraph * build_starcoder() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -4028,7 +4198,7 @@ struct llm_build_context {
     }

     struct ggml_cgraph * build_persimmon() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

         const int64_t n_rot = n_embd_head / 2;

@@ -4173,7 +4343,7 @@ struct llm_build_context {
             struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
             cb(Kcur, "Kcur", il);

-            struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur,
+            struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
             cb(Q, "Q", il);

             Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
@@ -4238,7 +4408,7 @@ struct llm_build_context {
     }

     struct ggml_cgraph * build_refact() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4329,7 +4499,7 @@ struct llm_build_context {
     }

     struct ggml_cgraph * build_bloom() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4423,7 +4593,7 @@ struct llm_build_context {
     }

     struct ggml_cgraph * build_mpt() {
-        struct ggml_cgraph * gf =
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4520,6 +4690,177 @@ struct llm_build_context {

         return gf;
     }
+
+    struct ggml_cgraph * build_stablelm() {
+        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_scale
+        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+        cb(KQ_scale, "KQ_scale", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, hparams.n_rot, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(tmpq, "tmpq", il);
+
+                struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(tmpk, "tmpk", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                // RoPE the first n_rot of q/k, pass the other half, and concat.
+                struct ggml_tensor * qrot = ggml_cont(ctx0, ggml_view_3d(
+                    ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
+                    ggml_element_size(tmpq) * n_embd_head,
+                    ggml_element_size(tmpq) * n_embd_head * n_head,
+                    0
+                ));
+                cb(qrot, "qrot", il);
+
+                struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d(
+                    ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
+                    ggml_element_size(tmpk) * n_embd_head,
+                    ggml_element_size(tmpk) * n_embd_head * n_head_kv,
+                    0
+                ));
+                cb(krot, "krot", il);
+
+                // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
+                struct ggml_tensor * qpass = ggml_view_3d(
+                    ctx0, tmpq, (n_embd_head - hparams.n_rot), n_head, n_tokens,
+                    ggml_element_size(tmpq) * n_embd_head,
+                    ggml_element_size(tmpq) * n_embd_head * n_head,
+                    ggml_element_size(tmpq) * hparams.n_rot
+                );
+                cb(qpass, "qpass", il);
+
+                struct ggml_tensor * kpass = ggml_view_3d(
+                    ctx0, tmpk, (n_embd_head - hparams.n_rot), n_head_kv, n_tokens,
+                    ggml_element_size(tmpk) * (n_embd_head),
+                    ggml_element_size(tmpk) * (n_embd_head) * n_head_kv,
+                    ggml_element_size(tmpk) * hparams.n_rot
+                );
+                cb(kpass, "kpass", il);
+
+                struct ggml_tensor * qrotated = ggml_rope_custom(
+                    ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(qrotated, "qrotated", il);
+
+                struct ggml_tensor * krotated = ggml_rope_custom(
+                    ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(krotated, "krotated", il);
+
+                // ggml currently only supports concatenation on dim=2
+                // so we need to permute qrot, qpass, concat, then permute back.
+                qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
+                cb(qrotated, "qrotated", il);
+
+                krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
+                cb(krotated, "krotated", il);
+
+                qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
+                cb(qpass, "qpass", il);
+
+                kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
+                cb(kpass, "kpass", il);
+
+                struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
+                cb(Q, "Q", il);
+
+                Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
+                cb(Kcur, "Kcur", il);
+
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                        model.layers[il].wo, NULL,
+                        Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };

 //
@@ -4989,6 +5330,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_mpt();
             } break;
+        case LLM_ARCH_STABLELM:
+            {
+                result = llm.build_stablelm();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -5159,11 +5504,13 @@ static int llama_decode_internal(

     // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
     const bool full_offload_supported =
-        model.arch == LLM_ARCH_LLAMA
-        model.arch == LLM_ARCH_BAICHUAN
-        model.arch == LLM_ARCH_FALCON
-        model.arch == LLM_ARCH_REFACT
-        model.arch == LLM_ARCH_MPT
+        model.arch == LLM_ARCH_LLAMA ||
+        model.arch == LLM_ARCH_BAICHUAN ||
+        model.arch == LLM_ARCH_FALCON ||
+        model.arch == LLM_ARCH_REFACT ||
+        model.arch == LLM_ARCH_MPT ||
+        model.arch == LLM_ARCH_STARCODER ||
+        model.arch == LLM_ARCH_STABLELM;

     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
     if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
@@ -5955,7 +6302,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
                 // and passing 'add space prefix' as bool argument
                 //
-                auto raw_text =
+                auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                if (&fragment == &fragment_buffer.front()) {
+                    raw_text = " " + raw_text; // prefix with space if the first token is not special
+                }

 #ifdef PRETOKENIZERDEBUG
                 fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
@@ -7607,7 +7957,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             workers.clear();
         }

-        LLAMA_LOG_INFO("size = %8.2f
+        LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
         int64_t tot_count = 0;
         for (size_t i = 0; i < hist_cur.size(); i++) {
             hist_all[i] += hist_cur[i];
@@ -7977,7 +8327,7 @@ struct llama_context_params llama_context_default_params() {
         /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
         /*.rope_freq_base =*/ 0.0f,
         /*.rope_freq_scale =*/ 0.0f,
-        /*.yarn_ext_factor =*/
+        /*.yarn_ext_factor =*/ -1.0f,
         /*.yarn_attn_factor =*/ 1.0f,
         /*.yarn_beta_fast =*/ 32.0f,
         /*.yarn_beta_slow =*/ 1.0f,
@@ -8120,7 +8470,7 @@ struct llama_context * llama_new_context_with_model(
         cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
     }

-    if (
+    if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
         cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
     }

@@ -8147,7 +8497,7 @@ struct llama_context * llama_new_context_with_model(

         {
             const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
-            LLAMA_LOG_INFO("%s: kv self size = %7.2f
+            LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
         }

         // resized during inference
@@ -8164,7 +8514,7 @@ struct llama_context * llama_new_context_with_model(
        {
            static const size_t tensor_alignment = 32;
            // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
-           ctx->buf_compute.resize(ggml_tensor_overhead()*
+           ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());

            // create measure allocator
            ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
@@ -8192,7 +8542,7 @@ struct llama_context * llama_new_context_with_model(
            // measure memory requirements for the graph
            size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;

-           LLAMA_LOG_INFO("%s: compute buffer total size = %.2f
+           LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);

            // recreate allocator with exact memory requirements
            ggml_allocr_free(ctx->alloc);
@@ -8206,7 +8556,7 @@ struct llama_context * llama_new_context_with_model(
 #endif
 #ifdef GGML_USE_CUBLAS
            ggml_cuda_set_scratch_size(alloc_size);
-           LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f
+           LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);

            // calculate total VRAM usage
            auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8226,10 +8576,10 @@ struct llama_context * llama_new_context_with_model(
            size_t ctx_vram_size = alloc_size + kv_vram_size;
            size_t total_vram_size = model_vram_size + ctx_vram_size;

-           LLAMA_LOG_INFO("%s: total VRAM used: %.2f
+           LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
                total_vram_size / 1024.0 / 1024.0,
                model_vram_size / 1024.0 / 1024.0,
-               ctx_vram_size
+               ctx_vram_size / 1024.0 / 1024.0);
 #endif
        }

@@ -8250,7 +8600,7 @@ struct llama_context * llama_new_context_with_model(

        const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

-       LLAMA_LOG_INFO("%s: max tensor size = %8.2f
+       LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);

 #define LLAMA_METAL_CHECK_BUF(result) \
        if (!(result)) { \
@@ -8553,8 +8903,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
        if (kv_buf_size) {
            const size_t elt_size = ggml_element_size(kv_self.k);

-           ggml_context * cpy_ctx = ggml_init({
-           ggml_cgraph gf
+           ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+           ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
            std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
@@ -8572,9 +8922,9 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
                kv_head, n_embd, n_layer,
                elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);

-           ggml_build_forward_expand(
-           ggml_build_forward_expand(
-           ggml_graph_compute_helper(ctx->work_buffer,
+           ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
+           ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
+           ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);

            ggml_free(cpy_ctx);

@@ -8681,8 +9031,8 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

        const size_t elt_size = ggml_element_size(kv_self.k);

-       ggml_context * cpy_ctx = ggml_init({
-       ggml_cgraph gf
+       ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+       ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

        ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
        kin3d->data = (void *) inp;
@@ -8700,9 +9050,9 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
            kv_head, n_embd, n_layer,
            elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);

-       ggml_build_forward_expand(
-       ggml_build_forward_expand(
-       ggml_graph_compute_helper(ctx->work_buffer,
+       ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
+       ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
+       ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);

        ggml_free(cpy_ctx);
    }
@@ -8957,6 +9307,14 @@ llama_token llama_token_nl(const struct llama_model * model) {
    return model->vocab.linefeed_id;
 }

+int llama_add_bos_token(const struct llama_model * model) {
+    return model->vocab.special_add_bos;
+}
+
+int llama_add_eos_token(const struct llama_model * model) {
+    return model->vocab.special_add_eos;
+}
+
 llama_token llama_token_prefix(const struct llama_model * model) {
    return model->vocab.special_prefix_id;
 }
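Note: llama_add_bos_token and llama_add_eos_token expose the new tokenizer.ggml.add_bos_token / tokenizer.ggml.add_eos_token GGUF metadata to API consumers (the gem mirrors them per the data/sig/llama_cpp.rbs entry in the file list above). A hedged sketch of how a caller might use the tri-state return value; the fallback default chosen here is the caller's own policy, not something the library prescribes:

    // -1 means the key was absent (or not a bool) in the model's GGUF metadata
    const int add_bos = llama_add_bos_token(model);
    const bool prepend_bos = (add_bos != -1) ? (add_bos != 0) : true; // illustrative default
    // pass prepend_bos to the tokenization call of your choice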