cui-llama.rn 1.2.1 → 1.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +2 -2
- package/android/src/main/java/com/rnllama/LlamaContext.java +32 -7
- package/cpp/common.cpp +36 -1
- package/cpp/common.h +5 -1
- package/cpp/ggml-aarch64.c +2 -11
- package/cpp/ggml-alloc.h +1 -1
- package/cpp/ggml-backend-impl.h +151 -78
- package/cpp/{ggml-backend.c → ggml-backend.cpp} +565 -269
- package/cpp/ggml-backend.h +147 -62
- package/cpp/ggml-impl.h +15 -0
- package/cpp/ggml-metal.h +8 -9
- package/cpp/ggml-metal.m +2428 -2111
- package/cpp/ggml-quants.c +2 -2
- package/cpp/ggml-quants.h +0 -4
- package/cpp/ggml.c +799 -1121
- package/cpp/ggml.h +79 -72
- package/cpp/llama-vocab.cpp +189 -106
- package/cpp/llama-vocab.h +18 -9
- package/cpp/llama.cpp +736 -341
- package/cpp/llama.h +9 -4
- package/cpp/unicode-data.cpp +6 -4
- package/cpp/unicode-data.h +4 -4
- package/cpp/unicode.cpp +14 -7
- package/package.json +1 -1
package/cpp/llama.cpp
CHANGED
@@ -12,9 +12,7 @@
 # include "ggml-rpc.h"
 #endif
 
-#
-# include "ggml-cuda.h"
-#elif defined(LM_GGML_USE_VULKAN)
+#if defined(LM_GGML_USE_VULKAN)
 # include "ggml-vulkan.h"
 #elif defined(LM_GGML_USE_SYCL)
 # include "ggml-sycl.h"
@@ -24,14 +22,6 @@
 # include "ggml-cann.h"
 #endif
 
-#ifdef LM_GGML_USE_BLAS
-# include "ggml-blas.h"
-#endif
-
-#ifdef LM_GGML_USE_METAL
-# include "ggml-metal.h"
-#endif
-
 // TODO: replace with ggml API call
 #define QK_K 256
 
@@ -227,6 +217,7 @@ enum llm_arch {
 LLM_ARCH_RWKV6,
 LLM_ARCH_GRANITE,
 LLM_ARCH_GRANITE_MOE,
+LLM_ARCH_CHAMELEON,
 LLM_ARCH_UNKNOWN,
 };
 
@@ -279,6 +270,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_RWKV6, "rwkv6" },
 { LLM_ARCH_GRANITE, "granite" },
 { LLM_ARCH_GRANITE_MOE, "granitemoe" },
+{ LLM_ARCH_CHAMELEON, "chameleon" },
 { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -315,6 +307,7 @@ enum llm_kv {
 LLM_KV_DECODER_START_TOKEN_ID,
 LLM_KV_ATTN_LOGIT_SOFTCAPPING,
 LLM_KV_FINAL_LOGIT_SOFTCAPPING,
+LLM_KV_SWIN_NORM,
 LLM_KV_RESCALE_EVERY_N_LAYERS,
 LLM_KV_TIME_MIX_EXTRA_DIM,
 LLM_KV_TIME_DECAY_EXTRA_DIM,
@@ -422,6 +415,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
 { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
 { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
+{ LLM_KV_SWIN_NORM, "%s.swin_norm" },
 { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
 { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
 { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
@@ -613,9 +607,11 @@ enum llm_tensor {
 LLM_TENSOR_ENC_FFN_DOWN,
 LLM_TENSOR_ENC_FFN_UP,
 LLM_TENSOR_ENC_OUTPUT_NORM,
+LLM_TENSOR_CLS,
+LLM_TENSOR_CLS_OUT,
 };
 
-static const std::map<llm_arch, std::map<llm_tensor,
+static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
 {
 LLM_ARCH_LLAMA,
 {
@@ -800,6 +796,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
 { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+{ LLM_TENSOR_CLS, "cls" },
+{ LLM_TENSOR_CLS_OUT, "cls.output" },
 },
 },
 {
@@ -835,6 +833,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
 { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+{ LLM_TENSOR_CLS, "cls" },
 },
 },
 {
@@ -1510,6 +1509,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
 },
 },
+{
+LLM_ARCH_CHAMELEON,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+},
+},
 {
 LLM_ARCH_UNKNOWN,
 {
@@ -1549,32 +1567,32 @@ struct LLM_TN {
 return LLM_TENSOR_NAMES.at(arch).at(tensor);
 }
 
-std::string operator()(llm_tensor tensor, const
+std::string operator()(llm_tensor tensor, const char * suffix) const {
 if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
 return "__missing__";
 }
-return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix;
+return std::string(LLM_TENSOR_NAMES.at(arch).at(tensor)) + "." + suffix;
 }
 
 std::string operator()(llm_tensor tensor, int bid) const {
 if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
 return "__missing__";
 }
-return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor)
+return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid);
 }
 
-std::string operator()(llm_tensor tensor, const
+std::string operator()(llm_tensor tensor, const char * suffix, int bid) const {
 if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
 return "__missing__";
 }
-return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor)
+return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid) + "." + suffix;
 }
 
-std::string operator()(llm_tensor tensor, const
+std::string operator()(llm_tensor tensor, const char * suffix, int bid, int xid) const {
 if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
 return "__missing__";
 }
-return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor)
+return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid, xid) + "." + suffix;
 }
 };
 
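The LLM_TN helpers above expand the printf-style templates stored in LLM_TENSOR_NAMES (now const char * rather than std::string) into concrete tensor names such as "blk.3.attn_q.weight". The standalone sketch below is not part of the package; it only illustrates that naming scheme, with a simplified format() function standing in for llama.cpp's ::format() helper.

    // Illustrative sketch only. format() here is a hypothetical stand-in for ::format().
    #include <cstdio>
    #include <string>

    static std::string format(const char * fmt, int bid) {
        char buf[128];
        std::snprintf(buf, sizeof(buf), fmt, bid); // expands e.g. "blk.%d.attn_q" with bid
        return std::string(buf);
    }

    int main() {
        // mirrors tn(LLM_TENSOR_ATTN_Q, "weight", 3) -> "blk.3.attn_q.weight"
        const char * tmpl = "blk.%d.attn_q";
        const std::string name = format(tmpl, 3) + "." + "weight";
        std::printf("%s\n", name.c_str()); // prints: blk.3.attn_q.weight
        return 0;
    }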
@@ -2247,59 +2265,16 @@ static std::string llama_token_to_piece(const struct llama_model * model, llama_
 return piece;
 }
 
-static lm_ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
-lm_ggml_backend_buffer_type_t buft = nullptr;
-
-#if defined(LM_GGML_USE_CUDA)
-// host buffers should only be used when data is expected to be copied to/from the GPU
-if (host_buffer) {
-buft = lm_ggml_backend_cuda_host_buffer_type();
-}
-#elif defined(LM_GGML_USE_SYCL)
-if (host_buffer) {
-buft = lm_ggml_backend_sycl_host_buffer_type();
-}
-#elif defined(LM_GGML_USE_CANN)
-if (host_buffer) {
-buft = lm_ggml_backend_cann_host_buffer_type();
-}
-#elif defined(LM_GGML_USE_CPU_HBM)
-buft = lm_ggml_backend_cpu_hbm_buffer_type();
-#elif defined(LM_GGML_USE_VULKAN)
-if (host_buffer) {
-buft = lm_ggml_backend_vk_host_buffer_type();
-}
-#endif
-
-if (buft == nullptr) {
-buft = lm_ggml_backend_cpu_buffer_type();
-}
-return buft;
-
-LM_GGML_UNUSED(host_buffer);
-}
-
 //
 // globals
 //
 
-struct
-llama_state() {
-#ifdef LM_GGML_USE_METAL
-lm_ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
-#elif defined(LM_GGML_USE_CUDA)
-lm_ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
-#elif defined(LM_GGML_USE_CANN)
-lm_ggml_backend_cann_log_set_callback(log_callback, log_callback_user_data);
-#endif
-}
-
-// We save the log callback globally
+struct llama_logger_state {
 lm_ggml_log_callback log_callback = llama_log_callback_default;
 void * log_callback_user_data = nullptr;
 };
 
-static
+static llama_logger_state g_logger_state;
 
 // available llama models
 enum e_model {
@@ -2373,6 +2348,7 @@ struct llama_hparams {
 bool vocab_only;
 bool rope_finetuned;
 bool use_par_res;
+bool swin_norm;
 
 uint32_t n_vocab;
 uint32_t n_ctx_train; // context size the model was trained on
@@ -2439,7 +2415,7 @@ struct llama_hparams {
 
 // needed by encoder-decoder models (e.g. T5, FLAN-T5)
 // ref: https://github.com/ggerganov/llama.cpp/pull/8141
-llama_token dec_start_token_id =
+llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
 
 enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
 enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -2881,6 +2857,7 @@ struct llama_model {
 llama_hparams hparams = {};
 llama_vocab vocab;
 
+// TODO: should init all tensors to nullptr
 struct lm_ggml_tensor * tok_embd;
 struct lm_ggml_tensor * type_embd;
 struct lm_ggml_tensor * pos_embd;
@@ -2893,16 +2870,25 @@ struct llama_model {
 struct lm_ggml_tensor * output_b;
 struct lm_ggml_tensor * output_norm_enc;
 
+// classifier
+struct lm_ggml_tensor * cls;
+struct lm_ggml_tensor * cls_b;
+struct lm_ggml_tensor * cls_out = nullptr;
+struct lm_ggml_tensor * cls_out_b = nullptr;
+
 std::vector<llama_layer> layers;
 
+// gguf metadata
+std::unordered_map<std::string, std::string> lm_gguf_kv;
+
 llama_split_mode split_mode;
 int main_gpu;
 int n_gpu_layers;
 
-
+// list of devices used in this model
+std::vector<lm_ggml_backend_dev_t> devices;
 
-
-std::unordered_map<std::string, std::string> lm_gguf_kv;
+std::vector<std::string> rpc_servers;
 
 // layer -> buffer type mapping
 struct layer_buft {
@@ -2945,11 +2931,6 @@ struct llama_model {
 lm_ggml_free(ctx);
 }
 for (lm_ggml_backend_buffer_t buf : bufs) {
-#ifdef LM_GGML_USE_CUDA
-if (lm_ggml_backend_buffer_get_type(buf) == lm_ggml_backend_cpu_buffer_type()) {
-lm_ggml_backend_cuda_unregister_host_buffer(lm_ggml_backend_buffer_get_base(buf));
-}
-#endif
 lm_ggml_backend_buffer_free(buf);
 }
 while (!lora_adapters.empty()) {
@@ -3314,12 +3295,8 @@ struct llama_context {
 std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
 
 std::vector<lm_ggml_backend_t> backends;
-
-
-#endif
-#ifdef LM_GGML_USE_BLAS
-lm_ggml_backend_t backend_blas = nullptr;
-#endif
+std::vector<std::pair<lm_ggml_backend_t, lm_ggml_backend_set_n_threads_t>> set_n_threads_fns;
+
 lm_ggml_backend_t backend_cpu = nullptr;
 
 lm_ggml_threadpool_t threadpool = nullptr;
@@ -3435,72 +3412,112 @@ struct llama_lora_adapter {
 }
 };
 
-static
-
-
-
-
-
+static int llama_get_device_count(const llama_model & model) {
+int count = (int) model.devices.size();
+
+#if defined(LM_GGML_USE_RPC)
+count += (int) model.rpc_servers.size();
+#endif
+
+#if defined(LM_GGML_USE_SYCL)
+count += lm_ggml_backend_sycl_get_device_count();
 #elif defined(LM_GGML_USE_VULKAN)
-count
+count += lm_ggml_backend_vk_get_device_count();
 #elif defined(LM_GGML_USE_CANN)
-
-#endif
-#if defined(LM_GGML_USE_RPC)
-count += model.rpc_servers.size();
+count += lm_ggml_backend_cann_get_device_count();
 #endif
+
 return count;
+
 LM_GGML_UNUSED(model);
 }
 
-static lm_ggml_backend_buffer_type_t
+static lm_ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_model & model, bool host_buffer) {
 lm_ggml_backend_buffer_type_t buft = nullptr;
 
-
-
-
-
+if (host_buffer) {
+for (auto * dev : model.devices) {
+buft = lm_ggml_backend_dev_host_buffer_type(dev);
+if (buft != nullptr) {
+break;
+}
+}
+}
+
+#if defined(LM_GGML_USE_SYCL)
+if (host_buffer) {
+buft = lm_ggml_backend_sycl_host_buffer_type();
+}
+#elif defined(LM_GGML_USE_CANN)
+if (host_buffer) {
+buft = lm_ggml_backend_cann_host_buffer_type();
+}
+#elif defined(LM_GGML_USE_CPU_HBM)
+buft = lm_ggml_backend_cpu_hbm_buffer_type();
+#elif defined(LM_GGML_USE_VULKAN)
+if (host_buffer) {
+buft = lm_ggml_backend_vk_host_buffer_type();
+}
 #endif
-
+
+if (buft == nullptr) {
+buft = lm_ggml_backend_cpu_buffer_type();
+}
+return buft;
+
+LM_GGML_UNUSED(host_buffer);
+}
+
+static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int device) {
+lm_ggml_backend_buffer_type_t buft = nullptr;
+
 #if defined(LM_GGML_USE_RPC)
-
-
+int rpc_count = (int)model.rpc_servers.size();
+if (device < rpc_count) {
+const char * endpoint = model.rpc_servers[device].c_str();
 return lm_ggml_backend_rpc_buffer_type(endpoint);
 }
+device -= rpc_count;
 #endif
-
-
-
-
-
-
+
+if (device < (int)model.devices.size()) {
+return lm_ggml_backend_dev_buffer_type(model.devices[device]);
+}
+device -= (int)model.devices.size();
+
+#if defined(LM_GGML_USE_VULKAN)
+buft = lm_ggml_backend_vk_buffer_type(device);
 #elif defined(LM_GGML_USE_SYCL)
-buft = lm_ggml_backend_sycl_buffer_type(
+buft = lm_ggml_backend_sycl_buffer_type(device);
 #elif defined(LM_GGML_USE_KOMPUTE)
-buft = lm_ggml_backend_kompute_buffer_type(
-if (buft == nullptr) {
-LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu);
-}
+buft = lm_ggml_backend_kompute_buffer_type(device);
 #elif defined(LM_GGML_USE_CANN)
-buft = lm_ggml_backend_cann_buffer_type(
+buft = lm_ggml_backend_cann_buffer_type(device);
 #endif
 
 if (buft == nullptr) {
-buft = llama_default_buffer_type_cpu(true);
+buft = llama_default_buffer_type_cpu(model, true);
 }
 return buft;
+
 LM_GGML_UNUSED(model);
-LM_GGML_UNUSED(local_gpu);
 }
 
 static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
 lm_ggml_backend_buffer_type_t buft = nullptr;
 
-
-
-
+// find a backend that supports split buffers
+for (size_t i = 0; i < lm_ggml_backend_reg_count(); ++i) {
+lm_ggml_backend_reg_t reg = lm_ggml_backend_reg_get(i);
+
+auto lm_ggml_backend_split_buffer_type_fn = (lm_ggml_backend_split_buffer_type_t) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_split_buffer_type");
+if (lm_ggml_backend_split_buffer_type_fn) {
+buft = lm_ggml_backend_split_buffer_type_fn(tensor_split);
+if (buft != nullptr) {
+break;
+}
+}
 }
-#endif
 
 #ifdef LM_GGML_USE_SYCL
 if (lm_ggml_backend_sycl_get_device_count() > 1) {
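llama_default_buffer_type_split() above no longer hard-codes a CUDA path; it walks the ggml backend registry and asks each backend, by name, for an optional split-buffer entry point. The toy program below uses hypothetical types and names (not the real ggml API) to sketch that lookup pattern.

    // Illustrative sketch only: a fake registry where one backend exposes an optional proc address.
    #include <cstdio>
    #include <cstring>

    using split_buffer_type_fn = const char * (*)(const float *);

    static const char * cuda_split_buffer_type(const float *) { return "cuda-split-buft"; }

    struct toy_backend_reg { const char * name; void * split_proc; };
    static toy_backend_reg g_regs[] = {
        { "cpu",  nullptr },                              // does not support split buffers
        { "cuda", (void *) &cuda_split_buffer_type },     // provides the optional entry point
    };

    static size_t reg_count() { return sizeof(g_regs) / sizeof(g_regs[0]); }
    static void * reg_get_proc_address(size_t i, const char * name) {
        return std::strcmp(name, "split_buffer_type") == 0 ? g_regs[i].split_proc : nullptr;
    }

    int main() {
        const float tensor_split[] = { 0.5f, 0.5f };
        const char * buft = nullptr;
        for (size_t i = 0; i < reg_count(); ++i) {
            auto fn = (split_buffer_type_fn) reg_get_proc_address(i, "split_buffer_type");
            if (fn && (buft = fn(tensor_split))) {
                std::printf("using split buffer type from backend '%s': %s\n", g_regs[i].name, buft);
                break;
            }
        }
        if (!buft) {
            std::printf("no backend supports split buffers, falling back\n");
        }
        return 0;
    }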
@@ -3517,13 +3534,8 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama
 }
 
 static size_t llama_get_device_memory(const llama_model & model, int device) {
-#ifdef LM_GGML_USE_RPC
-int rpc_count = (int)model.rpc_servers.size();
-#else
-int rpc_count = 0;
-#endif
-int local_device = device - rpc_count;
 #if defined(LM_GGML_USE_RPC)
+int rpc_count = (int)model.rpc_servers.size();
 if (device < rpc_count) {
 size_t total;
 size_t free;
@@ -3531,32 +3543,37 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
 lm_ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
 return free;
 }
+device = device - rpc_count;
 #endif
-
-
-
-
-
-
+
+if (device < (int)model.devices.size()) {
+lm_ggml_backend_dev_t dev = model.devices[device];
+size_t total;
+size_t free;
+lm_ggml_backend_dev_memory(dev, &free, &total);
+return free;
+}
+
+#if defined(LM_GGML_USE_SYCL)
 size_t total;
 size_t free;
-lm_ggml_backend_sycl_get_device_memory(
+lm_ggml_backend_sycl_get_device_memory(device, &free, &total);
 return free;
 #elif defined(LM_GGML_USE_VULKAN)
 size_t total;
 size_t free;
-lm_ggml_backend_vk_get_device_memory(
+lm_ggml_backend_vk_get_device_memory(device, &free, &total);
 return free;
 #elif defined(LM_GGML_USE_CANN)
 size_t total;
 size_t free;
-lm_ggml_backend_cann_get_device_memory(
+lm_ggml_backend_cann_get_device_memory(device, &free, &total);
 return free;
 #else
 return 1;
 #endif
 LM_GGML_UNUSED(model);
-LM_GGML_UNUSED(
+LM_GGML_UNUSED(device);
 }
 
 //
@@ -3599,7 +3616,7 @@ static bool llama_kv_cache_init(
 buft_layer_count[model.buft_layer[i].buft]++;
 }
 } else {
-buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
+buft_layer_count[llama_default_buffer_type_cpu(model, true)] = n_layer;
 }
 
 // create a context for each buffer type
@@ -4891,7 +4908,7 @@ struct llama_model_loader {
 static const int TENSOR_NOT_REQUIRED = 1;
 static const int TENSOR_DUPLICATED = 2;
 
-struct lm_ggml_tensor * create_tensor(struct lm_ggml_context * ctx, const std::string & name, const std::
+struct lm_ggml_tensor * create_tensor(struct lm_ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0) {
 const struct lm_ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
 
 if (cur == NULL) {
@@ -4901,7 +4918,7 @@ struct llama_model_loader {
 return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
 }
 
-struct lm_ggml_tensor * create_tensor_as_view(struct lm_ggml_context * ctx, struct lm_ggml_tensor * base, const std::string & name, const std::
+struct lm_ggml_tensor * create_tensor_as_view(struct lm_ggml_context * ctx, struct lm_ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true) {
 const struct lm_ggml_tensor * cur = check_tensor_dims(name, ne, required);
 
 if (cur == NULL) {
@@ -4914,7 +4931,7 @@ struct llama_model_loader {
 
 std::array<int64_t, LM_GGML_MAX_DIMS> dims;
 for (size_t i = 0; i < LM_GGML_MAX_DIMS; ++i) {
-dims[i] = i < ne.size() ? ne[i] : 1;
+dims[i] = i < ne.size() ? ne.begin()[i] : 1;
 }
 
 struct lm_ggml_tensor * tensor = lm_ggml_view_4d(ctx, base,
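The loader signatures above switch the ne parameter to std::initializer_list<int64_t>, which has no operator[]; the dims loop therefore indexes through ne.begin(). A minimal standalone sketch of that idiom, with MAX_DIMS standing in for LM_GGML_MAX_DIMS:

    // Illustrative sketch only: expand an initializer_list into a fixed-size dims array.
    #include <array>
    #include <cstdint>
    #include <cstdio>
    #include <initializer_list>

    constexpr size_t MAX_DIMS = 4; // stand-in for LM_GGML_MAX_DIMS

    static std::array<int64_t, MAX_DIMS> expand_dims(const std::initializer_list<int64_t> & ne) {
        std::array<int64_t, MAX_DIMS> dims;
        for (size_t i = 0; i < MAX_DIMS; ++i) {
            // unspecified trailing dimensions default to 1, as in the loader code above
            dims[i] = i < ne.size() ? ne.begin()[i] : 1;
        }
        return dims;
    }

    int main() {
        const auto dims = expand_dims({4096, 32000});
        std::printf("%lld %lld %lld %lld\n",
                    (long long) dims[0], (long long) dims[1], (long long) dims[2], (long long) dims[3]);
        return 0; // prints: 4096 32000 1 1
    }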
@@ -5012,7 +5029,7 @@ struct llama_model_loader {
 // Returns false if cancelled by progress_callback
 bool load_all_data(
 struct lm_ggml_context * ctx,
-llama_buf_map &
+llama_buf_map & bufs,
 llama_mlocks * lmlocks,
 llama_progress_callback progress_callback,
 void * progress_callback_user_data) {
@@ -5021,43 +5038,94 @@ struct llama_model_loader {
 std::vector<no_init<uint8_t>> read_buf;
 std::vector<std::future<std::pair<lm_ggml_tensor *, bool>>> validation_result;
 
-#if defined(LM_GGML_USE_CUDA)
 // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
 // NVMe raid configurations might require more / larger buffers.
 constexpr size_t n_buffers = 4;
 constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
 
 std::vector<lm_ggml_backend_buffer_t> host_buffers;
-std::vector<void*> host_ptrs;
 std::vector<lm_ggml_backend_event_t> events;
+std::vector<void *> host_ptrs;
 size_t buffer_idx = 0; // buffer to use for async loads
-
-
-
+lm_ggml_backend_t upload_backend = [&](const char * fn) -> lm_ggml_backend_t {
+if (use_mmap || check_tensors) {
+return nullptr;
+}
 // When not using mmaped io use async uploads from pinned memory to GPU memory.
-// First determine if the
-
-if (buf) {
-
-
-
-
-
-
-
-
+// First determine if the backend supports the necessary features for async uploads.
+auto * buf = bufs.count(0) ? bufs.at(0) : nullptr;
+if (!buf) {
+LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", fn);
+return nullptr;
+}
+
+auto * buft = lm_ggml_backend_buffer_get_type(buf);
+auto * dev = lm_ggml_backend_buft_get_device(buft);
+if (!dev) {
+LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", fn,
+lm_ggml_backend_buft_name(buft));
+return nullptr;
+}
+
+if (buft != lm_ggml_backend_dev_buffer_type(dev)) {
+LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", fn,
+lm_ggml_backend_buft_name(buft), lm_ggml_backend_dev_name(dev));
+return nullptr;
+}
+
+lm_ggml_backend_dev_props props;
+lm_ggml_backend_dev_get_props(dev, &props);
+if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) {
+LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", fn,
+lm_ggml_backend_dev_name(dev));
+return nullptr;
 }
 
-
-if (
-
-
-
-
+auto * host_buft = lm_ggml_backend_dev_host_buffer_type(dev);
+if (!host_buft) {
+LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", fn,
+lm_ggml_backend_dev_name(dev));
+return nullptr;
+}
+
+// If the backend is supported, create pinned memory buffers and events for synchronisation.
+for (size_t idx = 0; idx < n_buffers; ++idx) {
+auto * buf = lm_ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
+if (!buf) {
+LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", fn,
+lm_ggml_backend_dev_name(dev));
+return nullptr;
+}
+
+host_buffers.emplace_back(buf);
+host_ptrs.emplace_back(lm_ggml_backend_buffer_get_base(buf));
+
+auto * event = lm_ggml_backend_event_new(dev);
+if (!event) {
+LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", fn,
+lm_ggml_backend_dev_name(dev));
+return nullptr;
 }
+
+events.emplace_back(event);
+}
+
+lm_ggml_backend_t backend = lm_ggml_backend_dev_init(dev, nullptr);
+if (!backend) {
+LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", fn,
+lm_ggml_backend_dev_name(dev));
+return nullptr;
 }
-#endif
+
+return backend;
+}(__func__);
+
+if (upload_backend) {
+LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__,
+lm_ggml_backend_dev_name(lm_ggml_backend_get_device(upload_backend)),
+lm_ggml_backend_buft_name(lm_ggml_backend_buffer_get_type(bufs.at(0))),
+lm_ggml_backend_name(upload_backend));
 }
 
 for (struct lm_ggml_tensor * cur = lm_ggml_get_first_tensor(ctx); cur != NULL; cur = lm_ggml_get_next_tensor(ctx, cur)) {
 const auto * weight = get_weight(lm_ggml_get_name(cur));
@@ -5077,8 +5145,8 @@ struct llama_model_loader {
 if (use_mmap) {
 const auto & mapping = mappings.at(weight->idx);
 lm_ggml_backend_buffer_t buf_mmap = nullptr;
-if (
-buf_mmap =
+if (bufs.count(weight->idx)) {
+buf_mmap = bufs.at(weight->idx);
 }
 uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
 
@@ -5114,9 +5182,8 @@ struct llama_model_loader {
 }));
 }
 } else {
-
-
-if (cuda_backend) {
+// If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
+if (upload_backend) {
 file->seek(weight->offs, SEEK_SET);
 
 size_t bytes_read = 0;
@@ -5126,17 +5193,14 @@ struct llama_model_loader {
 
 lm_ggml_backend_event_synchronize(events[buffer_idx]);
 file->read_raw(host_ptrs[buffer_idx], read_iteration);
-lm_ggml_backend_tensor_set_async(
-lm_ggml_backend_event_record(events[buffer_idx]);
+lm_ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+lm_ggml_backend_event_record(events[buffer_idx], upload_backend);
 
 bytes_read += read_iteration;
 ++buffer_idx;
 buffer_idx %= n_buffers;
 }
-}
-else
-#endif
-{
+} else {
 read_buf.resize(n_size);
 file->seek(weight->offs, SEEK_SET);
 file->read_raw(read_buf.data(), n_size);
@@ -5151,17 +5215,15 @@ struct llama_model_loader {
 size_done += n_size;
 }
 
-
-
-
-
-lm_ggml_backend_event_synchronize(events[idx]);
-lm_ggml_backend_event_free(events[idx]);
-lm_ggml_backend_buffer_free(host_buffers[idx]);
-}
-lm_ggml_backend_free(cuda_backend);
+// free temporary resources used for async uploads
+for (auto * event : events) {
+lm_ggml_backend_event_synchronize(event);
+lm_ggml_backend_event_free(event);
 }
-
+for (auto * buf : host_buffers) {
+lm_ggml_backend_buffer_free(buf);
+}
+lm_ggml_backend_free(upload_backend);
 
 // check validation results
 bool validation_failed = false;
@@ -5477,8 +5539,10 @@ static void llm_load_hparams(
 }
 } else {
 switch (hparams.n_layer) {
+case 16: model.type = e_model::MODEL_1B; break; // Llama 3.2 1B
 case 22: model.type = e_model::MODEL_1B; break;
 case 26: model.type = e_model::MODEL_3B; break;
+case 28: model.type = e_model::MODEL_3B; break; // Llama 3.2 3B
 // granite uses a vocab with len 49152
 case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
 case 36: model.type = e_model::MODEL_8B; break; // granite
@@ -5591,11 +5655,11 @@ static void llm_load_hparams(
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
 hparams.f_max_alibi_bias = 8.0f;
 
 switch (hparams.n_layer) {
-case 4:
+case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
 case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
 }
 } break;
@@ -6095,6 +6159,18 @@ static void llm_load_hparams(
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+case LLM_ARCH_CHAMELEON:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
+ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
+
+switch (hparams.n_layer) {
+case 32: model.type = e_model::MODEL_7B; break;
+case 48: model.type = e_model::MODEL_34B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
 default: (void)0;
 }
 
@@ -6288,6 +6364,7 @@ static void llm_load_vocab(
 tokenizer_pre == "phi-2" ||
 tokenizer_pre == "jina-es" ||
 tokenizer_pre == "jina-de" ||
+tokenizer_pre == "jina-v1-en" ||
 tokenizer_pre == "jina-v2-es" ||
 tokenizer_pre == "jina-v2-de" ||
 tokenizer_pre == "jina-v2-code") {
@@ -6352,6 +6429,11 @@ static void llm_load_vocab(
 } else if (
 tokenizer_pre == "exaone") {
 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
+} else if (
+tokenizer_pre == "chameleon") {
+vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
+vocab.tokenizer_add_bos = true;
+vocab.tokenizer_clean_spaces = false;
 } else {
 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 }
@@ -6409,7 +6491,12 @@ static void llm_load_vocab(
 
 for (uint32_t i = 0; i < n_vocab; i++) {
 std::string word = lm_gguf_get_arr_str(ctx, token_idx, i);
-
+
+//LM_GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
+if (word.empty()) {
+LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
+word = "[EMPTY_" + std::to_string(i) + "]";
+}
 
 vocab.token_to_id[word] = i;
 vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
@@ -6434,6 +6521,8 @@ static void llm_load_vocab(
 }
 LM_GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
 
+vocab.init_tokenizer();
+
 // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
 if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
 // For Fill-In-the-Middle (FIM)/infill models which where converted
@@ -6488,8 +6577,14 @@ static void llm_load_vocab(
 vocab.linefeed_id = ids[0];
 } else {
 const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
-
-
+
+//LM_GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+if (ids.empty()) {
+LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
+vocab.linefeed_id = vocab.special_pad_id;
+} else {
+vocab.linefeed_id = ids[0];
+}
 }
 
 // special tokens
@@ -6864,6 +6959,13 @@ static bool llm_load_tensors(
 void * progress_callback_user_data) {
 auto & hparams = model.hparams;
 
+// check if the value of main_gpu is valid
+if (llama_get_device_count(model) > 0 &&
+split_mode != LLAMA_SPLIT_MODE_LAYER &&
+(main_gpu < 0 || main_gpu >= llama_get_device_count(model))) {
+throw std::runtime_error(format("invalid value for main_gpu: %d (available devices: %d)", main_gpu, llama_get_device_count(model)));
+}
+
 model.split_mode = split_mode;
 model.main_gpu = main_gpu;
 model.n_gpu_layers = n_gpu_layers;
@@ -6873,14 +6975,14 @@ static bool llm_load_tensors(
 bool use_mmap_buffer = true;
 
 // there is very little benefit to offloading the input layer, so always keep it on the CPU
-model.buft_input = llama_default_buffer_type_cpu(true);
+model.buft_input = llama_default_buffer_type_cpu(model, true);
 //model.buft_input = llama_default_buffer_type_offload(main_gpu);
 
 model.buft_layer.resize(n_layer);
 
 // assign cpu layers
 for (int i = 0; i < i_gpu_start; ++i) {
-model.buft_layer[i] = llama_default_buffer_type_cpu(true);
+model.buft_layer[i] = llama_default_buffer_type_cpu(model, true);
 }
 
 if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
@@ -6918,7 +7020,7 @@ static bool llm_load_tensors(
 int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
 model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
 } else {
-model.buft_output = llama_default_buffer_type_cpu(true);
+model.buft_output = llama_default_buffer_type_cpu(model, true);
 }
 } else {
 lm_ggml_backend_buffer_type_t split_buft;
@@ -6942,7 +7044,7 @@ static bool llm_load_tensors(
 llama_default_buffer_type_offload(model, main_gpu)
 };
 } else {
-model.buft_output = llama_default_buffer_type_cpu(true);
+model.buft_output = llama_default_buffer_type_cpu(model, true);
 }
 }
 
@@ -7362,6 +7464,12 @@ static bool llm_load_tensors(
 
 if (model.arch == LLM_ARCH_BERT) {
 model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train});
+
+model.cls = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+model.cls_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+model.cls_out = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+model.cls_out_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS_OUT, "bias"), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
 }
 
 model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
@@ -7414,6 +7522,8 @@ static bool llm_load_tensors(
 model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
 model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); //LayerNorm bias
 
+model.cls = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+model.cls_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "bias"), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
 for (int i = 0; i < n_layer; ++i) {
 lm_ggml_context * ctx_layer = ctx_for_layer(i);
 lm_ggml_context * ctx_split = ctx_for_layer_split(i);
@@ -8739,6 +8849,45 @@ static bool llm_load_tensors(
 }
 
 } break;
+case LLM_ARCH_CHAMELEON:
+{
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+// output
+{
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+// if output is NULL, init from the input tok embed
+if (model.output == NULL) {
+model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+}
+}
+
+for (int i = 0; i < n_layer; ++i) {
+lm_ggml_context * ctx_layer = ctx_for_layer(i);
+lm_ggml_context * ctx_split = ctx_for_layer_split(i);
+
+auto & layer = model.layers[i];
+
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head});
+layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv});
+layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+}
+} break;
 default:
 throw std::runtime_error("unknown architecture");
 }
@@ -8764,55 +8913,40 @@ static bool llm_load_tensors(
 llama_buf_map bufs;
 bufs.reserve(n_max_backend_buffer);
 
-//
-//
-
-
+// check if this backend device supports buffer_from_host_ptr
+// when using a host buffer as the CPU bakcend buffer, use the CPU device to prioritize using buffer_from_host_ptr over the host buffer
+lm_ggml_backend_dev_t dev = lm_ggml_backend_buft_get_device(buft == llama_default_buffer_type_cpu(model, true) ? lm_ggml_backend_cpu_buffer_type() : buft);
+bool buffer_from_host_ptr_supported = false;
+if (dev) {
+lm_ggml_backend_dev_props props;
+lm_ggml_backend_dev_get_props(dev, &props);
+buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
+}
+
+if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported) {
 for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
+// only the mmap region containing the tensors in the model is mapped to the backend buffer
+// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
+// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
 void * addr = nullptr;
-size_t first, last;
+size_t first, last; // NOLINT
 ml.get_mapping_range(&first, &last, &addr, idx, ctx);
 if (first >= last) {
 continue;
 }
-
+const size_t max_size = lm_ggml_get_max_tensor_size(ctx);
+lm_ggml_backend_buffer_t buf = lm_ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
 if (buf == nullptr) {
-throw std::runtime_error("unable to allocate
+throw std::runtime_error(format("unable to allocate %s buffer", lm_ggml_backend_buft_name(buft)));
 }
 model.bufs.push_back(buf);
 bufs.emplace(idx, buf);
-#ifdef LM_GGML_USE_CUDA
-if (n_layer >= n_gpu_layers) {
-lm_ggml_backend_cuda_register_host_buffer(
-lm_ggml_backend_buffer_get_base(buf),
-lm_ggml_backend_buffer_get_size(buf));
-}
-#endif
 }
 }
-#ifdef LM_GGML_USE_METAL
-else if (ml.use_mmap && use_mmap_buffer && buft == lm_ggml_backend_metal_buffer_type()) {
-for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
-const size_t max_size = lm_ggml_get_max_tensor_size(ctx);
-void * addr = nullptr;
-size_t first, last;
-ml.get_mapping_range(&first, &last, &addr, idx, ctx);
-if (first >= last) {
-continue;
-}
-lm_ggml_backend_buffer_t buf = lm_ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
-if (buf == nullptr) {
-throw std::runtime_error("unable to allocate backend metal buffer");
-}
-model.bufs.push_back(buf);
-bufs.emplace(idx, buf);
-}
-}
-#endif
 else {
 lm_ggml_backend_buffer_t buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
 if (buf == nullptr) {
-throw std::runtime_error("unable to allocate
+throw std::runtime_error(format("unable to allocate %s buffer", lm_ggml_backend_buft_name(buft)));
 }
 model.bufs.push_back(buf);
 if (use_mlock && lm_ggml_backend_buffer_is_host(buf)) {
@@ -10208,6 +10342,10 @@ struct llm_build_context {
 struct lm_ggml_tensor * cur;
 
 switch (pooling_type) {
+case LLAMA_POOLING_TYPE_NONE:
+{
+cur = inp;
+} break;
 case LLAMA_POOLING_TYPE_MEAN:
 {
 struct lm_ggml_tensor * inp_mean = build_inp_mean();
@@ -10219,9 +10357,26 @@ struct llm_build_context {
 struct lm_ggml_tensor * inp_cls = build_inp_cls();
 cur = lm_ggml_get_rows(ctx0, inp, inp_cls);
 } break;
-case
+case LLAMA_POOLING_TYPE_RANK:
 {
-
+struct lm_ggml_tensor * inp_cls = build_inp_cls();
+inp = lm_ggml_get_rows(ctx0, inp, inp_cls);
+
+// classification head
+// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
+LM_GGML_ASSERT(model.cls != nullptr);
+LM_GGML_ASSERT(model.cls_b != nullptr);
+
+cur = lm_ggml_add (ctx0, lm_ggml_mul_mat(ctx0, model.cls, inp), model.cls_b);
+cur = lm_ggml_tanh(ctx0, cur);
+
+// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
+if (model.cls_out) {
+LM_GGML_ASSERT(model.cls_out_b != nullptr);
+
+cur = lm_ggml_add (ctx0, lm_ggml_mul_mat(ctx0, model.cls_out, cur), model.cls_out_b);
+}
 } break;
 default:
 {
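For LLAMA_POOLING_TYPE_RANK the graph above takes the CLS row, applies the cls/cls_b dense layer followed by tanh and, when present, the single-output cls_out/cls_out_b projection, producing one rerank score per sequence. The sketch below redoes that arithmetic with plain floats and made-up toy weights; it is only an illustration of the math, not ggml code.

    // Illustrative sketch only: score = cls_out(tanh(cls * x_cls + cls_b)) + cls_out_b.
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // dense layer y = W * x + b, with W stored row-major as [n_out][n_in]
    static std::vector<float> dense(const std::vector<float> & W, const std::vector<float> & b,
                                    const std::vector<float> & x) {
        const size_t n_out = b.size(), n_in = x.size();
        std::vector<float> y(n_out, 0.0f);
        for (size_t o = 0; o < n_out; ++o) {
            for (size_t i = 0; i < n_in; ++i) {
                y[o] += W[o * n_in + i] * x[i];
            }
            y[o] += b[o];
        }
        return y;
    }

    int main() {
        // toy n_embd = 3
        const std::vector<float> x_cls = { 0.1f, -0.2f, 0.3f };  // pooled CLS embedding
        const std::vector<float> cls_w = { 0.5f, 0.1f, 0.0f,     // cls: n_embd x n_embd
                                           0.0f, 0.4f, 0.2f,
                                           0.3f, 0.0f, 0.6f };
        const std::vector<float> cls_b = { 0.0f, 0.1f, -0.1f };
        const std::vector<float> out_w = { 1.0f, -1.0f, 0.5f };  // cls_out: 1 x n_embd
        const std::vector<float> out_b = { 0.0f };

        std::vector<float> h = dense(cls_w, cls_b, x_cls);
        for (float & v : h) v = std::tanh(v);
        const float score = dense(out_w, out_b, h)[0]; // models without cls_out stop at h
        std::printf("rerank score: %f\n", score);
        return 0;
    }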
@@ -11450,8 +11605,8 @@ struct llm_build_context {
 inpL = cur;
 }
 
-// final output
 cur = inpL;
+
 cb(cur, "result_embd", -1);
 
 lm_ggml_build_forward_expand(gf, cur);
@@ -15883,6 +16038,184 @@ struct llm_build_context {
 
 return gf;
 }
+
+// ref: https://github.com/facebookresearch/chameleon
+// based on the original build_llama() function, changes:
+// * qk-norm
+// * swin-norm
+// * removed bias
+// * removed MoE
+struct lm_ggml_cgraph * build_chameleon() {
+struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+// mutable variable, needed during the last layer of the computation to skip unused tokens
+int32_t n_tokens = this->n_tokens;
+
+const int64_t n_embd_head = hparams.n_embd_head_v;
+LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+struct lm_ggml_tensor * cur;
+struct lm_ggml_tensor * inpL;
+
+inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+// inp_pos - contains the positions
+struct lm_ggml_tensor * inp_pos = build_inp_pos();
+
+// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+for (int il = 0; il < n_layer; ++il) {
+struct lm_ggml_tensor * inpSA = inpL;
+
+// norm
+if (hparams.swin_norm) {
+cur = inpL;
+} else {
+cur = llm_build_norm(ctx0, inpL, hparams,
+model.layers[il].attn_norm, NULL,
+LLM_NORM_RMS, cb, il);
+cb(cur, "attn_norm", il);
+}
+
+// self-attention
+{
+// compute Q and K and RoPE them
+struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+cb(Qcur, "Qcur", il);
+
+struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+cb(Kcur, "Kcur", il);
+
+struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+cb(Vcur, "Vcur", il);
+
+if (model.layers[il].attn_q_norm) {
+Qcur = lm_ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
+lm_ggml_element_size(Qcur) * n_embd_head,
+lm_ggml_element_size(Qcur) * n_embd_head * n_head,
+0);
+cb(Qcur, "Qcur", il);
+
+Qcur = llm_build_norm(ctx0, Qcur, hparams,
+model.layers[il].attn_q_norm,
+model.layers[il].attn_q_norm_b,
+LLM_NORM, cb, il);
+cb(Qcur, "Qcur", il);
+}
+
+if (model.layers[il].attn_k_norm) {
+Kcur = lm_ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
+lm_ggml_element_size(Kcur) * n_embd_head,
+lm_ggml_element_size(Kcur) * n_embd_head * n_head_kv,
+0);
+cb(Kcur, "Kcur", il);
+
+Kcur = llm_build_norm(ctx0, Kcur, hparams,
+model.layers[il].attn_k_norm,
+model.layers[il].attn_k_norm_b,
+LLM_NORM, cb, il);
+cb(Kcur, "Kcur", il);
+}
+
+Qcur = lm_ggml_rope_ext(
+ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
+cb(Qcur, "Qcur", il);
+
+Kcur = lm_ggml_rope_ext(
+ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
+cb(Kcur, "Kcur", il);
+
+cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+model.layers[il].wo, nullptr,
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+
+if (hparams.swin_norm) {
+cur = llm_build_norm(ctx0, cur, hparams,
+model.layers[il].attn_norm, NULL,
+LLM_NORM_RMS, cb, il);
+}
+}
+
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+n_tokens = n_outputs;
+cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+}
+
+struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+cb(ffn_inp, "ffn_inp", il);
+
+// feed-forward network
+if (!hparams.swin_norm) {
+cur = llm_build_norm(ctx0, ffn_inp, hparams,
+model.layers[il].ffn_norm, NULL,
+LLM_NORM_RMS, cb, il);
+cb(cur, "ffn_norm", il);
+}
+
+cur = llm_build_ffn(ctx0, lctx, cur,
+model.layers[il].ffn_up, NULL, NULL,
+model.layers[il].ffn_gate, NULL, NULL,
+model.layers[il].ffn_down, NULL, NULL,
+NULL,
+LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+cb(cur, "ffn_out", il);
+
+if (hparams.swin_norm) {
+cur = llm_build_norm(ctx0, cur, hparams,
+model.layers[il].ffn_norm, NULL,
+LLM_NORM_RMS, cb, il);
+cb(cur, "ffn_norm", il);
+}
+
+cur = lm_ggml_add(ctx0, cur, ffn_inp);
+cb(cur, "ffn_out", il);
+
+cur = lctx.cvec.apply_to(ctx0, cur, il);
+cb(cur, "l_out", il);
+
+// input for next layer
+inpL = cur;
+}
+
+cur = inpL;
+
+cur = llm_build_norm(ctx0, cur, hparams,
+model.output_norm, NULL,
+LLM_NORM_RMS, cb, -1);
+cb(cur, "result_norm", -1);
+
+// lm_head
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+cb(cur, "result_output_with_img_logits", -1);
+
+// TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
+// Needs to be removed once image outputs are supported.
+int img_token_end_idx = 8196;
+int img_token_start_idx = 4;
+int num_img_tokens = img_token_end_idx - img_token_start_idx;
+// creates 1d tensor of size num_img_tokens and values -FLT_MAX,
+// which ensures that text token values are always at least larger than image token values
+struct lm_ggml_tensor * img_logits = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, num_img_tokens);
+img_logits = lm_ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
+cb(img_logits, "img_logits", -1);
+cur = lm_ggml_set_1d(ctx0, cur, img_logits, lm_ggml_element_size(cur) * img_token_start_idx);
+cb(cur, "result_output", -1);
+
+lm_ggml_build_forward_expand(gf, cur);
+
+return gf;
+}
 };
 
 static struct lm_ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
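build_chameleon() above gates its RMS norms on hparams.swin_norm: when the flag is set, normalization runs after the attention and FFN blocks instead of before them. The standalone sketch below shows only that ordering difference, with a trivial toy block standing in for the attention/FFN sub-graph.

    // Illustrative sketch only: pre-norm vs swin-norm (post-norm) ordering.
    #include <cmath>
    #include <cstdio>
    #include <vector>

    using vec = std::vector<float>;

    static vec rms_norm(const vec & x) {
        float ss = 0.0f;
        for (float v : x) ss += v * v;
        const float scale = 1.0f / std::sqrt(ss / x.size() + 1e-5f);
        vec y(x);
        for (float & v : y) v *= scale;
        return y;
    }

    // stand-in for the attention / FFN sub-block
    static vec block(const vec & x) {
        vec y(x);
        for (float & v : y) v = 2.0f * v + 1.0f;
        return y;
    }

    static vec apply(const vec & inp, bool swin_norm) {
        // swin_norm: normalize the block's output; otherwise normalize its input
        return swin_norm ? rms_norm(block(inp)) : block(rms_norm(inp));
    }

    int main() {
        const vec x = { 0.5f, -1.0f, 2.0f };
        const vec pre  = apply(x, /*swin_norm=*/false);
        const vec post = apply(x, /*swin_norm=*/true);
        std::printf("pre-norm:  %f %f %f\n", pre[0],  pre[1],  pre[2]);
        std::printf("swin-norm: %f %f %f\n", post[0], post[1], post[2]);
        return 0;
    }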
@@ -16143,6 +16476,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
 {
 result = llm.build_rwkv6();
 } break;
+case LLM_ARCH_CHAMELEON:
+{
+result = llm.build_chameleon();
+} break;
 default:
 LM_GGML_ABORT("fatal error");
 }
@@ -16429,7 +16766,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
 }
 }
 
-if (cparams.embeddings &&
+if (cparams.embeddings && (
+cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
+cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) {
 const int64_t n_tokens = batch.n_tokens;
 const int64_t n_seq_tokens = batch.n_seq_tokens;
 const int64_t n_seqs = batch.n_seqs;
@@ -16444,7 +16783,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
 const llama_seq_id seq_id = batch.seq_id[s][0];
 
 // TODO: adapt limits to n_seqs when batch.equal_seqs is true
-LM_GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
+LM_GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK");
 
 for (int i = 0; i < n_seq_tokens; ++i) {
 const llama_pos pos = batch.pos[s*n_seq_tokens + i];
@@ -16646,7 +16985,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
 lctx.embd = nullptr;
 }
 
-lctx.buf_output = lm_ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
+lctx.buf_output = lm_ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(lctx.model, true), new_size);
 if (lctx.buf_output == nullptr) {
 LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
 return 0;
@@ -16715,24 +17054,20 @@ static void llama_graph_compute(
|
|
16715
17054
|
lm_ggml_cgraph * gf,
|
16716
17055
|
int n_threads,
|
16717
17056
|
lm_ggml_threadpool * threadpool) {
|
16718
|
-
#ifdef LM_GGML_USE_METAL
|
16719
|
-
if (lm_ggml_backend_is_metal(lctx.backend_metal)) {
|
16720
|
-
lm_ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
16721
|
-
}
|
16722
|
-
#endif
|
16723
|
-
|
16724
17057
|
if (lctx.backend_cpu != nullptr) {
|
16725
|
-
lm_ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
16726
17058
|
lm_ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
|
16727
17059
|
lm_ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
|
16728
17060
|
}
|
16729
|
-
|
16730
|
-
|
16731
|
-
|
17061
|
+
|
17062
|
+
// set the number of threads for all the backends
|
17063
|
+
for (const auto & set_n_threads_fn : lctx.set_n_threads_fns) {
|
17064
|
+
set_n_threads_fn.second(set_n_threads_fn.first, n_threads);
|
16732
17065
|
}
|
16733
|
-
#endif
|
16734
17066
|
|
16735
|
-
lm_ggml_backend_sched_graph_compute_async(lctx.sched, gf);
|
17067
|
+
auto err = lm_ggml_backend_sched_graph_compute_async(lctx.sched, gf);
|
17068
|
+
if (err != LM_GGML_STATUS_SUCCESS) {
|
17069
|
+
LLAMA_LOG_ERROR("%s: lm_ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, err);
|
17070
|
+
}
|
16736
17071
|
|
16737
17072
|
// fprintf(stderr, "splits: %d\n", lm_ggml_backend_sched_get_n_splits(lctx.sched));
|
16738
17073
|
}
|
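
A rough sketch of the dispatch pattern introduced in llama_graph_compute above: per-backend "set n_threads" entry points are collected once as (handle, function) pairs, invoked before every compute, and the compute status is checked. All types and names below are stand-ins for the sketch, not the ggml API.

```cpp
#include <cstdio>
#include <utility>
#include <vector>

using backend_handle_t = void *;
using set_n_threads_t  = void (*)(backend_handle_t, int);

struct fake_backend { int n_threads = 1; };

static void fake_set_n_threads(backend_handle_t h, int n) {
    static_cast<fake_backend *>(h)->n_threads = n;
}

int main() {
    fake_backend cpu, blas;
    // cached once at context creation time in the real code
    std::vector<std::pair<backend_handle_t, set_n_threads_t>> set_n_threads_fns = {
        { &cpu,  fake_set_n_threads },
        { &blas, fake_set_n_threads },
    };

    const int n_threads = 8;
    for (const auto & fn : set_n_threads_fns) {
        fn.second(fn.first, n_threads); // mirrors the loop in llama_graph_compute
    }

    const int status = 0; // stand-in for the scheduler's async compute status
    if (status != 0) {
        std::fprintf(stderr, "graph compute failed with error %d\n", status);
    }
    std::printf("cpu=%d blas=%d threads\n", cpu.n_threads, blas.n_threads);
    return 0;
}
```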
@@ -16984,6 +17319,20 @@ static int llama_decode_internal(
                     lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
                 }
             } break;
+        case LLAMA_POOLING_TYPE_RANK:
+            {
+                // extract the rerank score - a single float per sequence
+                auto & embd_seq_out = lctx.embd_seq;
+
+                for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
+                    const llama_seq_id seq_id = ubatch.seq_id[s][0];
+                    if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+                        continue;
+                    }
+                    embd_seq_out[seq_id].resize(1);
+                    lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float));
+                }
+            } break;
         case LLAMA_POOLING_TYPE_UNSPECIFIED:
             {
                 LM_GGML_ABORT("unknown pooling type");
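
A small self-contained sketch of the bookkeeping in the RANK pooling case above: one float per sequence, stored in a map keyed by sequence id, skipping sequences that were already filled. The toy pooled buffer stands in for the asynchronous backend tensor read.

```cpp
#include <cstdio>
#include <map>
#include <vector>

int main() {
    // pretend pooled output: one rerank score per sequence, laid out by sequence id
    const std::vector<float> pooled  = { 0.91f, 0.12f, 0.57f };
    const std::vector<int>   seq_ids = { 0, 1, 1, 2 }; // sequence id of each ubatch slot

    std::map<int, std::vector<float>> embd_seq_out;
    for (int seq_id : seq_ids) {
        if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
            continue; // already extracted for this sequence
        }
        embd_seq_out[seq_id].resize(1);
        embd_seq_out[seq_id][0] = pooled[seq_id]; // async tensor read in the real code
    }

    for (const auto & kv : embd_seq_out) {
        std::printf("seq %d -> score %.2f\n", kv.first, kv.second[0]);
    }
    return 0;
}
```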
@@ -17190,6 +17539,13 @@ static int llama_encode_internal(
                     lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
                 }
             } break;
+        case LLAMA_POOLING_TYPE_RANK:
+            {
+                // TODO: this likely should be the same logic as in llama_decoder_internal, but better to
+                // wait for an encoder model that requires this pooling type in order to test it
+                // https://github.com/ggerganov/llama.cpp/pull/9510
+                LM_GGML_ABORT("RANK pooling not implemented yet");
+            }
         case LLAMA_POOLING_TYPE_UNSPECIFIED:
             {
                 LM_GGML_ABORT("unknown pooling type");
@@ -17527,10 +17883,9 @@ static void llama_tensor_dequantize_internal(
     }
     float * f32_output = (float *) output.data();

-
+    const lm_ggml_type_traits * qtype = lm_ggml_get_type_traits(tensor->type);
     if (lm_ggml_is_quantized(tensor->type)) {
-        qtype
-        if (qtype.to_float == NULL) {
+        if (qtype->to_float == NULL) {
             throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", lm_ggml_type_name(tensor->type)));
         }
     } else if (tensor->type != LM_GGML_TYPE_F16 &&
@@ -17544,7 +17899,7 @@ static void llama_tensor_dequantize_internal(
     } else if (tensor->type == LM_GGML_TYPE_BF16) {
         lm_ggml_bf16_to_fp32_row((lm_ggml_bf16_t *)tensor->data, f32_output, nelements);
     } else if (lm_ggml_is_quantized(tensor->type)) {
-        qtype
+        qtype->to_float(tensor->data, f32_output, nelements);
     } else {
         LM_GGML_ABORT("fatal error"); // unreachable
     }
@@ -17580,7 +17935,7 @@ static void llama_tensor_dequantize_internal(
         } else if (typ == LM_GGML_TYPE_BF16) {
             lm_ggml_bf16_to_fp32_row((lm_ggml_bf16_t *)inbuf, outbuf, nels);
         } else {
-            qtype
+            qtype->to_float(inbuf, outbuf, nels);
         }
     };
     workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
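
The dequantize path above now fetches a type-traits pointer once and calls the type's to_float hook through it, failing early when the hook is missing. An illustrative sketch of that trait-table dispatch; the trait struct, table, and toy conversion are invented for the example.

```cpp
#include <cstdio>
#include <stdexcept>
#include <vector>

struct type_traits_sketch {
    const char * name;
    void (*to_float)(const void * in, float * out, int n); // null => no dequantization available
};

static void toy_dequantize_row(const void * in, float * out, int n) {
    const unsigned char * p = static_cast<const unsigned char *>(in);
    for (int i = 0; i < n; ++i) {
        out[i] = p[i] / 255.0f; // toy conversion standing in for real dequantization
    }
}

static const type_traits_sketch k_traits[] = {
    { "toy_q8",    toy_dequantize_row },
    { "toy_nodeq", nullptr            },
};

int main() {
    const unsigned char data[4] = { 0, 64, 128, 255 };
    std::vector<float> f32(4);

    const type_traits_sketch * qtype = &k_traits[0];
    if (qtype->to_float == nullptr) {
        throw std::runtime_error("no dequantization available");
    }
    qtype->to_float(data, f32.data(), 4);
    std::printf("%.3f %.3f %.3f %.3f\n", f32[0], f32[1], f32[2], f32[3]);
    return 0;
}
```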
@@ -18662,21 +19017,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 }

 size_t llama_max_devices(void) {
-
-    return LM_GGML_RPC_MAX_SERVERS;
-#elif defined(LM_GGML_USE_METAL)
-    return 1;
-#elif defined(LM_GGML_USE_CUDA)
-    return LM_GGML_CUDA_MAX_DEVICES;
-#elif defined(LM_GGML_USE_SYCL)
-    return LM_GGML_SYCL_MAX_DEVICES;
-#elif defined(LM_GGML_USE_VULKAN)
-    return LM_GGML_VK_MAX_DEVICES;
-#elif defined(LM_GGML_USE_CANN)
-    return LM_GGML_CANN_MAX_DEVICES;
-#else
-    return 1;
-#endif
+    return 16;
 }

 bool llama_supports_mmap(void) {
@@ -18688,12 +19029,13 @@ bool llama_supports_mlock(void) {
 }

 bool llama_supports_gpu_offload(void) {
-#if defined(
+#if defined(LM_GGML_USE_VULKAN) || \
     defined(LM_GGML_USE_SYCL) || defined(LM_GGML_USE_KOMPUTE) || defined(LM_GGML_USE_RPC)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
-    return
+    return lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
+           lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr;
 #endif
 }

@@ -18758,17 +19100,37 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
+
     if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
         // split the servers set them into model->rpc_servers
         std::string servers(params.rpc_servers);
         size_t pos = 0;
-        while ((pos = servers.find(
+        while ((pos = servers.find(',')) != std::string::npos) {
             std::string server = servers.substr(0, pos);
             model->rpc_servers.push_back(server);
             servers.erase(0, pos + 1);
         }
         model->rpc_servers.push_back(servers);
     }
+
+    // create list of devices to use with this model
+    // currently, we use all available devices
+    // TODO: rework API to give user more control over device selection
+    for (size_t i = 0; i < lm_ggml_backend_dev_count(); ++i) {
+        lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
+        switch (lm_ggml_backend_dev_type(dev)) {
+            case LM_GGML_BACKEND_DEVICE_TYPE_CPU:
+            case LM_GGML_BACKEND_DEVICE_TYPE_CPU_FULL:
+                // skip CPU backends since they are handled separately
+                break;
+
+            case LM_GGML_BACKEND_DEVICE_TYPE_GPU:
+            case LM_GGML_BACKEND_DEVICE_TYPE_GPU_FULL:
+                model->devices.push_back(dev);
+                break;
+        }
+    }
+
     int status = llama_model_load(path_model, *model, params);
     LM_GGML_ASSERT(status <= 0);
     if (status < 0) {
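
llama_load_model_from_file now populates the model's device list by walking the backend device registry and keeping only GPU-class devices, leaving CPU devices to be handled later. A compact sketch of that enumerate-and-filter loop, with a stand-in registry and enum in place of the lm_ggml_backend_dev_* API.

```cpp
#include <cstdio>
#include <string>
#include <vector>

enum class dev_type { CPU, CPU_FULL, GPU, GPU_FULL };

struct device { std::string name; dev_type type; };

int main() {
    // pretend device registry, analogous to lm_ggml_backend_dev_count()/dev_get()
    const std::vector<device> registry = {
        { "CPU",   dev_type::CPU_FULL },
        { "Metal", dev_type::GPU_FULL },
        { "BLAS",  dev_type::CPU      },
    };

    std::vector<const device *> model_devices;
    for (const auto & dev : registry) {
        switch (dev.type) {
            case dev_type::CPU:
            case dev_type::CPU_FULL:
                break; // skip CPU backends, they are added when the context is created
            case dev_type::GPU:
            case dev_type::GPU_FULL:
                model_devices.push_back(&dev);
                break;
        }
    }

    for (const device * dev : model_devices) {
        std::printf("using device: %s\n", dev->name.c_str());
    }
    return 0;
}
```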
@@ -18930,60 +19292,61 @@ struct llama_context * llama_new_context_with_model(

     if (!hparams.vocab_only) {
         // initialize backends
-
-
-
-
+        int main_gpu = model->main_gpu;
+
+        // with registry
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            if (main_gpu >= 0 && main_gpu < (int)model->devices.size()) {
+                lm_ggml_backend_dev_t main_dev = model->devices[main_gpu];
+                lm_ggml_backend_t backend = lm_ggml_backend_dev_init(main_dev, nullptr);
                 if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize
+                    LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, lm_ggml_backend_dev_name(main_dev));
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
+        } else {
+            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+            for (auto * dev : model->devices) {
+                lm_ggml_backend_t backend = lm_ggml_backend_dev_init(dev, nullptr);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, lm_ggml_backend_dev_name(dev));
                     llama_free(ctx);
                     return nullptr;
                 }
                 ctx->backends.push_back(backend);
             }
         }
-
+        if (main_gpu >= (int)model->devices.size()) {
+            main_gpu -= (int)model->devices.size();
+        }

-#if defined(
+#if defined(LM_GGML_USE_RPC)
         if (model->n_gpu_layers > 0) {
-
-
-                LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(ctx->backend_metal);
-        }
-#elif defined(LM_GGML_USE_CUDA)
-        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
-            lm_ggml_backend_t backend = lm_ggml_backend_cuda_init(model->main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        } else {
-            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
-            for (int device = 0; device < lm_ggml_backend_cuda_get_device_count(); ++device) {
-                lm_ggml_backend_t backend = lm_ggml_backend_cuda_init(device);
+            for (const auto & endpoint : model->rpc_servers) {
+                lm_ggml_backend_t backend = lm_ggml_backend_rpc_init(endpoint.c_str());
                 if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize
+                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
                     llama_free(ctx);
                     return nullptr;
                 }
                 ctx->backends.push_back(backend);
             }
         }
-
+        if (main_gpu >= (int)model->rpc_servers.size()) {
+            main_gpu -= (int)model->rpc_servers.size();
+        }
+#endif
+
+#if defined(LM_GGML_USE_VULKAN)
         if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
             LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
             llama_free(ctx);
             return nullptr;
         }
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
-            lm_ggml_backend_t backend = lm_ggml_backend_vk_init(
+            lm_ggml_backend_t backend = lm_ggml_backend_vk_init(main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
                 llama_free(ctx);
@@ -19004,9 +19367,9 @@ struct llama_context * llama_new_context_with_model(
 #elif defined(LM_GGML_USE_SYCL)
         // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            lm_ggml_backend_t backend = lm_ggml_backend_sycl_init(
+            lm_ggml_backend_t backend = lm_ggml_backend_sycl_init(main_gpu);
             if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__,
+                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, main_gpu);
                 llama_free(ctx);
                 return nullptr;
             }
@@ -19025,7 +19388,7 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(LM_GGML_USE_KOMPUTE)
         if (model->n_gpu_layers > 0) {
-            auto * backend = lm_ggml_backend_kompute_init(
+            auto * backend = lm_ggml_backend_kompute_init(main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
                 llama_free(ctx);
@@ -19034,39 +19397,44 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(backend);
         }
 #elif defined(LM_GGML_USE_CANN)
-
-
-
-
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        } else {
-            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
-            // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
-            for (int32_t device = 0; device < lm_ggml_backend_cann_get_device_count(); ++device) {
-                lm_ggml_backend_t backend = lm_ggml_backend_cann_init(device);
+        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+        // TODO: lm_ggml_backend_cann is not support split tensor now, just leave code here.
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            lm_ggml_backend_t backend = lm_ggml_backend_cann_init(main_gpu);
             if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__,
+                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, main_gpu);
                 llama_free(ctx);
                 return nullptr;
             }
             ctx->backends.push_back(backend);
+        } else {
+            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+            // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
+            for (int32_t device = 0; device < lm_ggml_backend_cann_get_device_count(); ++device) {
+                lm_ggml_backend_t backend = lm_ggml_backend_cann_init(device);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
         }
-        }
 #endif

-
-
-
-
-
-
+        // add other backends (such as BLAS)
+        for (size_t i = 0; i < lm_ggml_backend_dev_count(); ++i) {
+            lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
+            if (lm_ggml_backend_dev_type(dev) == LM_GGML_BACKEND_DEVICE_TYPE_CPU) {
+                lm_ggml_backend_t backend = lm_ggml_backend_dev_init(dev, nullptr);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, lm_ggml_backend_dev_name(dev));
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
         }
-#endif

         ctx->backend_cpu = lm_ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
@@ -19076,6 +19444,18 @@ struct llama_context * llama_new_context_with_model(
         }
         ctx->backends.push_back(ctx->backend_cpu);

+        // create a list of the set_n_threads functions in the backends
+        for (auto * backend : ctx->backends) {
+            lm_ggml_backend_dev_t dev = lm_ggml_backend_get_device(backend);
+            lm_ggml_backend_reg_t reg = dev ? lm_ggml_backend_dev_backend_reg(dev) : nullptr;
+            if (reg) {
+                auto lm_ggml_backend_set_n_threads_fn = (lm_ggml_backend_set_n_threads_t) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_set_n_threads");
+                if (lm_ggml_backend_set_n_threads_fn) {
+                    ctx->set_n_threads_fns.emplace_back(backend, lm_ggml_backend_set_n_threads_fn);
+                }
+            }
+        }
+
         if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
             LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
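
The hunk above queries each backend's registry for an optional "set n_threads" entry point and caches the (backend, function) pairs that llama_graph_compute later invokes. A sketch of that optional proc-address lookup, with a made-up registry type standing in for lm_ggml_backend_reg_t.

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

using set_n_threads_t = void (*)(void * backend, int n_threads);

static void cpu_set_n_threads(void * /*backend*/, int n_threads) {
    std::printf("cpu backend now uses %d threads\n", n_threads);
}

// pretend per-backend registry: optional entry points looked up by name
struct backend_reg_sketch {
    std::map<std::string, set_n_threads_t> procs;
    set_n_threads_t get_proc_address(const std::string & name) const {
        auto it = procs.find(name);
        return it == procs.end() ? nullptr : it->second;
    }
};

int main() {
    int cpu_backend = 0, gpu_backend = 1;                 // opaque handles for the sketch
    backend_reg_sketch cpu_reg { { { "set_n_threads", cpu_set_n_threads } } };
    backend_reg_sketch gpu_reg {};                        // does not export the symbol

    std::vector<std::pair<void *, const backend_reg_sketch *>> backends = {
        { &cpu_backend, &cpu_reg },
        { &gpu_backend, &gpu_reg },
    };

    std::vector<std::pair<void *, set_n_threads_t>> set_n_threads_fns;
    for (auto & b : backends) {
        if (auto fn = b.second->get_proc_address("set_n_threads")) {
            set_n_threads_fns.emplace_back(b.first, fn);  // cache only when the symbol exists
        }
    }

    for (auto & fn : set_n_threads_fns) {
        fn.second(fn.first, 4);
    }
    return 0;
}
```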
@@ -19121,7 +19501,7 @@ struct llama_context * llama_new_context_with_model(
         for (auto * backend : ctx->backends) {
             if (lm_ggml_backend_is_cpu(backend)) {
                 // use host buffers for the CPU backend compute buffer
-                backend_buft.push_back(llama_default_buffer_type_cpu(true));
+                backend_buft.push_back(llama_default_buffer_type_cpu(*model, true));
             } else {
                 backend_buft.push_back(lm_ggml_backend_get_default_buffer_type(backend));
             }
@@ -19132,17 +19512,37 @@ struct llama_context * llama_new_context_with_model(
         // buffer used to store the computation graph and the tensor meta data
         ctx->buf_compute_meta.resize(lm_ggml_tensor_overhead()*max_nodes + lm_ggml_graph_overhead_custom(max_nodes, false));

+        // TODO: move these checks to lm_ggml_backend_sched
        // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
        bool pipeline_parallel =
            llama_get_device_count(*model) > 1 &&
            model->n_gpu_layers > (int)model->hparams.n_layer &&
            model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
            params.offload_kqv;
-
-        // pipeline parallelism requires support for async compute and events
-
-
-
+
+        // pipeline parallelism requires support for async compute and events in all devices
+        if (pipeline_parallel) {
+            for (auto * backend : ctx->backends) {
+                if (lm_ggml_backend_is_cpu(backend)) {
+                    // ignore CPU backend
+                    continue;
+                }
+                auto * dev = lm_ggml_backend_get_device(backend);
+                if (!dev) {
+                    // backend is using old interface, not supported
+                    pipeline_parallel = false;
+                    break;
+                }
+                lm_ggml_backend_dev_props props;
+                lm_ggml_backend_dev_get_props(dev, &props);
+                if (!props.caps.async || !props.caps.events) {
+                    // device does not support async compute or events
+                    pipeline_parallel = false;
+                    break;
+                }
+            }
+        }
+
         ctx->sched = lm_ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);

         if (pipeline_parallel) {
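
Pipeline parallelism is now kept enabled only when every non-CPU backend exposes a device that reports async-compute and event support. A minimal sketch of that capability gate, with an invented props struct in place of lm_ggml_backend_dev_props.

```cpp
#include <cstdio>
#include <vector>

struct dev_props_sketch {
    bool is_cpu;
    bool async;
    bool events;
};

int main() {
    const std::vector<dev_props_sketch> devices = {
        { true,  false, false },  // CPU backend: ignored by the check
        { false, true,  true  },  // GPU 0
        { false, true,  false },  // GPU 1: no event support
    };

    bool pipeline_parallel = devices.size() > 1; // optimistic starting point
    if (pipeline_parallel) {
        for (const auto & d : devices) {
            if (d.is_cpu) {
                continue;                        // ignore CPU backend
            }
            if (!d.async || !d.events) {
                pipeline_parallel = false;       // this device cannot participate
                break;
            }
        }
    }
    std::printf("pipeline parallelism: %s\n", pipeline_parallel ? "on" : "off");
    return 0;
}
```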
@@ -19268,6 +19668,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_CHAMELEON:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2
@@ -21446,15 +21847,9 @@ const std::vector<std::pair<std::string, struct lm_ggml_tensor *>> & llama_inter
 }

 void llama_log_set(lm_ggml_log_callback log_callback, void * user_data) {
-
-
-
-    lm_ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
-#elif defined(LM_GGML_USE_CUDA)
-    lm_ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
-#elif defined(LM_GGML_USE_CANN)
-    lm_ggml_backend_cann_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
-#endif
+    lm_ggml_log_set(log_callback, user_data);
+    g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
+    g_logger_state.log_callback_user_data = user_data;
 }

 static void llama_log_internal_v(lm_ggml_log_level level, const char * format, va_list args) {
@@ -21463,12 +21858,12 @@ static void llama_log_internal_v(lm_ggml_log_level level, const char * format, v
     char buffer[128];
     int len = vsnprintf(buffer, 128, format, args);
     if (len < 128) {
-
+        g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
     } else {
         char * buffer2 = new char[len + 1];
         vsnprintf(buffer2, len + 1, format, args_copy);
         buffer2[len] = 0;
-
+        g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
         delete[] buffer2;
     }
     va_end(args_copy);
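
llama_log_set now forwards to lm_ggml_log_set and records the callback plus its user data in a logger state that llama_log_internal_v uses to emit formatted messages. A self-contained sketch of that logger-state pattern; the names loosely mirror the hunk, the rest is illustrative, and the long-message heap path is omitted.

```cpp
#include <cstdarg>
#include <cstdio>

enum log_level { LOG_INFO, LOG_ERROR };
using log_callback_t = void (*)(log_level level, const char * text, void * user_data);

static void default_log_callback(log_level, const char * text, void *) {
    std::fputs(text, stderr);
}

static struct {
    log_callback_t log_callback = default_log_callback;
    void * log_callback_user_data = nullptr;
} g_logger_state;

static void log_set(log_callback_t cb, void * user_data) {
    g_logger_state.log_callback = cb ? cb : default_log_callback; // fall back to the default sink
    g_logger_state.log_callback_user_data = user_data;
}

static void log_internal(log_level level, const char * fmt, ...) {
    char buffer[128];
    va_list args;
    va_start(args, fmt);
    std::vsnprintf(buffer, sizeof(buffer), fmt, args);
    va_end(args);
    g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
}

int main() {
    log_internal(LOG_INFO, "default sink: %d backends\n", 2);
    log_set([](log_level, const char * text, void *) { std::printf("[custom] %s", text); }, nullptr);
    log_internal(LOG_ERROR, "custom sink: error %d\n", -1);
    return 0;
}
```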