cui-llama.rn 1.2.0 → 1.2.2

This diff shows the changes between publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
Files changed (40)
  1. package/README.md +2 -0
  2. package/android/src/main/CMakeLists.txt +2 -2
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +31 -9
  4. package/android/src/main/java/com/rnllama/RNLlama.java +39 -0
  5. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +5 -0
  6. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +5 -0
  7. package/cpp/common.cpp +36 -1
  8. package/cpp/common.h +5 -1
  9. package/cpp/ggml-aarch64.c +2 -11
  10. package/cpp/ggml-alloc.h +1 -1
  11. package/cpp/ggml-backend-impl.h +151 -78
  12. package/cpp/{ggml-backend.c → ggml-backend.cpp} +565 -269
  13. package/cpp/ggml-backend.h +147 -62
  14. package/cpp/ggml-impl.h +15 -0
  15. package/cpp/ggml-metal.h +8 -9
  16. package/cpp/ggml-metal.m +2428 -2111
  17. package/cpp/ggml-quants.c +2 -2
  18. package/cpp/ggml-quants.h +0 -4
  19. package/cpp/ggml.c +799 -1121
  20. package/cpp/ggml.h +79 -72
  21. package/cpp/llama-vocab.cpp +189 -106
  22. package/cpp/llama-vocab.h +18 -9
  23. package/cpp/llama.cpp +736 -341
  24. package/cpp/llama.h +9 -4
  25. package/cpp/unicode-data.cpp +6 -4
  26. package/cpp/unicode-data.h +4 -4
  27. package/cpp/unicode.cpp +14 -7
  28. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  29. package/lib/commonjs/index.js +4 -0
  30. package/lib/commonjs/index.js.map +1 -1
  31. package/lib/module/NativeRNLlama.js.map +1 -1
  32. package/lib/module/index.js +3 -0
  33. package/lib/module/index.js.map +1 -1
  34. package/lib/typescript/NativeRNLlama.d.ts +6 -0
  35. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  36. package/lib/typescript/index.d.ts +2 -1
  37. package/lib/typescript/index.d.ts.map +1 -1
  38. package/package.json +1 -1
  39. package/src/NativeRNLlama.ts +7 -0
  40. package/src/index.ts +5 -0
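
Note: most of the llama.cpp changes below replace per-backend #ifdef paths (CUDA, Metal, BLAS) with the generic ggml backend registry/device API, and add the Chameleon architecture along with a classifier head used for LLAMA_POOLING_TYPE_RANK. A minimal sketch of the registry-lookup pattern the new code relies on follows; the function and type names are taken from the hunks below, but the wrapper itself is illustrative and not part of the package.

    // Illustrative only: find a backend that exposes the optional split-buffer
    // extension by querying each registered backend for a named proc address.
    #include "ggml-backend.h"   // assumed available from package/cpp

    static lm_ggml_backend_buffer_type_t find_split_buffer_type(const float * tensor_split) {
        for (size_t i = 0; i < lm_ggml_backend_reg_count(); ++i) {
            lm_ggml_backend_reg_t reg = lm_ggml_backend_reg_get(i);
            // optional per-backend feature, resolved by name at runtime
            auto fn = (lm_ggml_backend_split_buffer_type_t)
                lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_split_buffer_type");
            if (fn) {
                lm_ggml_backend_buffer_type_t buft = fn(tensor_split);
                if (buft) {
                    return buft;   // this backend supports split buffers
                }
            }
        }
        return nullptr;            // caller falls back to a CPU buffer type
    }
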
package/cpp/llama.cpp CHANGED
@@ -12,9 +12,7 @@
12
12
  # include "ggml-rpc.h"
13
13
  #endif
14
14
 
15
- #ifdef LM_GGML_USE_CUDA
16
- # include "ggml-cuda.h"
17
- #elif defined(LM_GGML_USE_VULKAN)
15
+ #if defined(LM_GGML_USE_VULKAN)
18
16
  # include "ggml-vulkan.h"
19
17
  #elif defined(LM_GGML_USE_SYCL)
20
18
  # include "ggml-sycl.h"
@@ -24,14 +22,6 @@
24
22
  # include "ggml-cann.h"
25
23
  #endif
26
24
 
27
- #ifdef LM_GGML_USE_BLAS
28
- # include "ggml-blas.h"
29
- #endif
30
-
31
- #ifdef LM_GGML_USE_METAL
32
- # include "ggml-metal.h"
33
- #endif
34
-
35
25
  // TODO: replace with ggml API call
36
26
  #define QK_K 256
37
27
 
@@ -227,6 +217,7 @@ enum llm_arch {
227
217
  LLM_ARCH_RWKV6,
228
218
  LLM_ARCH_GRANITE,
229
219
  LLM_ARCH_GRANITE_MOE,
220
+ LLM_ARCH_CHAMELEON,
230
221
  LLM_ARCH_UNKNOWN,
231
222
  };
232
223
 
@@ -279,6 +270,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
279
270
  { LLM_ARCH_RWKV6, "rwkv6" },
280
271
  { LLM_ARCH_GRANITE, "granite" },
281
272
  { LLM_ARCH_GRANITE_MOE, "granitemoe" },
273
+ { LLM_ARCH_CHAMELEON, "chameleon" },
282
274
  { LLM_ARCH_UNKNOWN, "(unknown)" },
283
275
  };
284
276
 
@@ -315,6 +307,7 @@ enum llm_kv {
315
307
  LLM_KV_DECODER_START_TOKEN_ID,
316
308
  LLM_KV_ATTN_LOGIT_SOFTCAPPING,
317
309
  LLM_KV_FINAL_LOGIT_SOFTCAPPING,
310
+ LLM_KV_SWIN_NORM,
318
311
  LLM_KV_RESCALE_EVERY_N_LAYERS,
319
312
  LLM_KV_TIME_MIX_EXTRA_DIM,
320
313
  LLM_KV_TIME_DECAY_EXTRA_DIM,
@@ -422,6 +415,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
422
415
  { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
423
416
  { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
424
417
  { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
418
+ { LLM_KV_SWIN_NORM, "%s.swin_norm" },
425
419
  { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
426
420
  { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
427
421
  { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
@@ -613,9 +607,11 @@ enum llm_tensor {
613
607
  LLM_TENSOR_ENC_FFN_DOWN,
614
608
  LLM_TENSOR_ENC_FFN_UP,
615
609
  LLM_TENSOR_ENC_OUTPUT_NORM,
610
+ LLM_TENSOR_CLS,
611
+ LLM_TENSOR_CLS_OUT,
616
612
  };
617
613
 
618
- static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
614
+ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
619
615
  {
620
616
  LLM_ARCH_LLAMA,
621
617
  {
@@ -800,6 +796,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
800
796
  { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
801
797
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
802
798
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
799
+ { LLM_TENSOR_CLS, "cls" },
800
+ { LLM_TENSOR_CLS_OUT, "cls.output" },
803
801
  },
804
802
  },
805
803
  {
@@ -835,6 +833,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
835
833
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
836
834
  { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
837
835
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
836
+ { LLM_TENSOR_CLS, "cls" },
838
837
  },
839
838
  },
840
839
  {
@@ -1510,6 +1509,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
1510
1509
  { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
1511
1510
  },
1512
1511
  },
1512
+ {
1513
+ LLM_ARCH_CHAMELEON,
1514
+ {
1515
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1516
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1517
+ { LLM_TENSOR_OUTPUT, "output" },
1518
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1519
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1520
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
1521
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
1522
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
1523
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
1524
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
1525
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
1526
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1527
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
1528
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
1529
+ },
1530
+ },
1513
1531
  {
1514
1532
  LLM_ARCH_UNKNOWN,
1515
1533
  {
@@ -1549,32 +1567,32 @@ struct LLM_TN {
1549
1567
  return LLM_TENSOR_NAMES.at(arch).at(tensor);
1550
1568
  }
1551
1569
 
1552
- std::string operator()(llm_tensor tensor, const std::string & suffix) const {
1570
+ std::string operator()(llm_tensor tensor, const char * suffix) const {
1553
1571
  if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
1554
1572
  return "__missing__";
1555
1573
  }
1556
- return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix;
1574
+ return std::string(LLM_TENSOR_NAMES.at(arch).at(tensor)) + "." + suffix;
1557
1575
  }
1558
1576
 
1559
1577
  std::string operator()(llm_tensor tensor, int bid) const {
1560
1578
  if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
1561
1579
  return "__missing__";
1562
1580
  }
1563
- return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid);
1581
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid);
1564
1582
  }
1565
1583
 
1566
- std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
1584
+ std::string operator()(llm_tensor tensor, const char * suffix, int bid) const {
1567
1585
  if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
1568
1586
  return "__missing__";
1569
1587
  }
1570
- return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix;
1588
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid) + "." + suffix;
1571
1589
  }
1572
1590
 
1573
- std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
1591
+ std::string operator()(llm_tensor tensor, const char * suffix, int bid, int xid) const {
1574
1592
  if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
1575
1593
  return "__missing__";
1576
1594
  }
1577
- return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
1595
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid, xid) + "." + suffix;
1578
1596
  }
1579
1597
  };
1580
1598
 
@@ -2247,59 +2265,16 @@ static std::string llama_token_to_piece(const struct llama_model * model, llama_
2247
2265
  return piece;
2248
2266
  }
2249
2267
 
2250
- static lm_ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
2251
- lm_ggml_backend_buffer_type_t buft = nullptr;
2252
-
2253
- #if defined(LM_GGML_USE_CUDA)
2254
- // host buffers should only be used when data is expected to be copied to/from the GPU
2255
- if (host_buffer) {
2256
- buft = lm_ggml_backend_cuda_host_buffer_type();
2257
- }
2258
- #elif defined(LM_GGML_USE_SYCL)
2259
- if (host_buffer) {
2260
- buft = lm_ggml_backend_sycl_host_buffer_type();
2261
- }
2262
- #elif defined(LM_GGML_USE_CANN)
2263
- if (host_buffer) {
2264
- buft = lm_ggml_backend_cann_host_buffer_type();
2265
- }
2266
- #elif defined(LM_GGML_USE_CPU_HBM)
2267
- buft = lm_ggml_backend_cpu_hbm_buffer_type();
2268
- #elif defined(LM_GGML_USE_VULKAN)
2269
- if (host_buffer) {
2270
- buft = lm_ggml_backend_vk_host_buffer_type();
2271
- }
2272
- #endif
2273
-
2274
- if (buft == nullptr) {
2275
- buft = lm_ggml_backend_cpu_buffer_type();
2276
- }
2277
- return buft;
2278
-
2279
- LM_GGML_UNUSED(host_buffer);
2280
- }
2281
-
2282
2268
  //
2283
2269
  // globals
2284
2270
  //
2285
2271
 
2286
- struct llama_state {
2287
- llama_state() {
2288
- #ifdef LM_GGML_USE_METAL
2289
- lm_ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
2290
- #elif defined(LM_GGML_USE_CUDA)
2291
- lm_ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
2292
- #elif defined(LM_GGML_USE_CANN)
2293
- lm_ggml_backend_cann_log_set_callback(log_callback, log_callback_user_data);
2294
- #endif
2295
- }
2296
-
2297
- // We save the log callback globally
2272
+ struct llama_logger_state {
2298
2273
  lm_ggml_log_callback log_callback = llama_log_callback_default;
2299
2274
  void * log_callback_user_data = nullptr;
2300
2275
  };
2301
2276
 
2302
- static llama_state g_state;
2277
+ static llama_logger_state g_logger_state;
2303
2278
 
2304
2279
  // available llama models
2305
2280
  enum e_model {
@@ -2373,6 +2348,7 @@ struct llama_hparams {
2373
2348
  bool vocab_only;
2374
2349
  bool rope_finetuned;
2375
2350
  bool use_par_res;
2351
+ bool swin_norm;
2376
2352
 
2377
2353
  uint32_t n_vocab;
2378
2354
  uint32_t n_ctx_train; // context size the model was trained on
@@ -2439,7 +2415,7 @@ struct llama_hparams {
2439
2415
 
2440
2416
  // needed by encoder-decoder models (e.g. T5, FLAN-T5)
2441
2417
  // ref: https://github.com/ggerganov/llama.cpp/pull/8141
2442
- llama_token dec_start_token_id = -1;
2418
+ llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
2443
2419
 
2444
2420
  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
2445
2421
  enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -2881,6 +2857,7 @@ struct llama_model {
2881
2857
  llama_hparams hparams = {};
2882
2858
  llama_vocab vocab;
2883
2859
 
2860
+ // TODO: should init all tensors to nullptr
2884
2861
  struct lm_ggml_tensor * tok_embd;
2885
2862
  struct lm_ggml_tensor * type_embd;
2886
2863
  struct lm_ggml_tensor * pos_embd;
@@ -2893,16 +2870,25 @@ struct llama_model {
2893
2870
  struct lm_ggml_tensor * output_b;
2894
2871
  struct lm_ggml_tensor * output_norm_enc;
2895
2872
 
2873
+ // classifier
2874
+ struct lm_ggml_tensor * cls;
2875
+ struct lm_ggml_tensor * cls_b;
2876
+ struct lm_ggml_tensor * cls_out = nullptr;
2877
+ struct lm_ggml_tensor * cls_out_b = nullptr;
2878
+
2896
2879
  std::vector<llama_layer> layers;
2897
2880
 
2881
+ // gguf metadata
2882
+ std::unordered_map<std::string, std::string> lm_gguf_kv;
2883
+
2898
2884
  llama_split_mode split_mode;
2899
2885
  int main_gpu;
2900
2886
  int n_gpu_layers;
2901
2887
 
2902
- std::vector<std::string> rpc_servers;
2888
+ // list of devices used in this model
2889
+ std::vector<lm_ggml_backend_dev_t> devices;
2903
2890
 
2904
- // gguf metadata
2905
- std::unordered_map<std::string, std::string> lm_gguf_kv;
2891
+ std::vector<std::string> rpc_servers;
2906
2892
 
2907
2893
  // layer -> buffer type mapping
2908
2894
  struct layer_buft {
@@ -2945,11 +2931,6 @@ struct llama_model {
2945
2931
  lm_ggml_free(ctx);
2946
2932
  }
2947
2933
  for (lm_ggml_backend_buffer_t buf : bufs) {
2948
- #ifdef LM_GGML_USE_CUDA
2949
- if (lm_ggml_backend_buffer_get_type(buf) == lm_ggml_backend_cpu_buffer_type()) {
2950
- lm_ggml_backend_cuda_unregister_host_buffer(lm_ggml_backend_buffer_get_base(buf));
2951
- }
2952
- #endif
2953
2934
  lm_ggml_backend_buffer_free(buf);
2954
2935
  }
2955
2936
  while (!lora_adapters.empty()) {
@@ -3314,12 +3295,8 @@ struct llama_context {
3314
3295
  std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
3315
3296
 
3316
3297
  std::vector<lm_ggml_backend_t> backends;
3317
- #ifdef LM_GGML_USE_METAL
3318
- lm_ggml_backend_t backend_metal = nullptr;
3319
- #endif
3320
- #ifdef LM_GGML_USE_BLAS
3321
- lm_ggml_backend_t backend_blas = nullptr;
3322
- #endif
3298
+ std::vector<std::pair<lm_ggml_backend_t, lm_ggml_backend_set_n_threads_t>> set_n_threads_fns;
3299
+
3323
3300
  lm_ggml_backend_t backend_cpu = nullptr;
3324
3301
 
3325
3302
  lm_ggml_threadpool_t threadpool = nullptr;
@@ -3435,72 +3412,112 @@ struct llama_lora_adapter {
3435
3412
  }
3436
3413
  };
3437
3414
 
3438
- static size_t llama_get_device_count(const llama_model & model) {
3439
- size_t count = 1;
3440
- #if defined(LM_GGML_USE_CUDA)
3441
- count = lm_ggml_backend_cuda_get_device_count();
3442
- #elif defined(LM_GGML_USE_SYCL)
3443
- count = lm_ggml_backend_sycl_get_device_count();
3415
+ static int llama_get_device_count(const llama_model & model) {
3416
+ int count = (int) model.devices.size();
3417
+
3418
+ #if defined(LM_GGML_USE_RPC)
3419
+ count += (int) model.rpc_servers.size();
3420
+ #endif
3421
+
3422
+ #if defined(LM_GGML_USE_SYCL)
3423
+ count += lm_ggml_backend_sycl_get_device_count();
3444
3424
  #elif defined(LM_GGML_USE_VULKAN)
3445
- count = lm_ggml_backend_vk_get_device_count();
3425
+ count += lm_ggml_backend_vk_get_device_count();
3446
3426
  #elif defined(LM_GGML_USE_CANN)
3447
- return lm_ggml_backend_cann_get_device_count();
3448
- #endif
3449
- #if defined(LM_GGML_USE_RPC)
3450
- count += model.rpc_servers.size();
3427
+ count += lm_ggml_backend_cann_get_device_count();
3451
3428
  #endif
3429
+
3452
3430
  return count;
3431
+
3453
3432
  LM_GGML_UNUSED(model);
3454
3433
  }
3455
3434
 
3456
- static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
3435
+ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_model & model, bool host_buffer) {
3457
3436
  lm_ggml_backend_buffer_type_t buft = nullptr;
3458
3437
 
3459
- #ifdef LM_GGML_USE_RPC
3460
- int rpc_count = (int)model.rpc_servers.size();
3461
- #else
3462
- int rpc_count = 0;
3438
+ if (host_buffer) {
3439
+ for (auto * dev : model.devices) {
3440
+ buft = lm_ggml_backend_dev_host_buffer_type(dev);
3441
+ if (buft != nullptr) {
3442
+ break;
3443
+ }
3444
+ }
3445
+ }
3446
+
3447
+ #if defined(LM_GGML_USE_SYCL)
3448
+ if (host_buffer) {
3449
+ buft = lm_ggml_backend_sycl_host_buffer_type();
3450
+ }
3451
+ #elif defined(LM_GGML_USE_CANN)
3452
+ if (host_buffer) {
3453
+ buft = lm_ggml_backend_cann_host_buffer_type();
3454
+ }
3455
+ #elif defined(LM_GGML_USE_CPU_HBM)
3456
+ buft = lm_ggml_backend_cpu_hbm_buffer_type();
3457
+ #elif defined(LM_GGML_USE_VULKAN)
3458
+ if (host_buffer) {
3459
+ buft = lm_ggml_backend_vk_host_buffer_type();
3460
+ }
3463
3461
  #endif
3464
- int local_gpu = gpu - rpc_count;
3462
+
3463
+ if (buft == nullptr) {
3464
+ buft = lm_ggml_backend_cpu_buffer_type();
3465
+ }
3466
+ return buft;
3467
+
3468
+ LM_GGML_UNUSED(host_buffer);
3469
+ }
3470
+
3471
+ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int device) {
3472
+ lm_ggml_backend_buffer_type_t buft = nullptr;
3473
+
3465
3474
  #if defined(LM_GGML_USE_RPC)
3466
- if (gpu < rpc_count) {
3467
- const char * endpoint = model.rpc_servers[gpu].c_str();
3475
+ int rpc_count = (int)model.rpc_servers.size();
3476
+ if (device < rpc_count) {
3477
+ const char * endpoint = model.rpc_servers[device].c_str();
3468
3478
  return lm_ggml_backend_rpc_buffer_type(endpoint);
3469
3479
  }
3480
+ device -= rpc_count;
3470
3481
  #endif
3471
- #if defined(LM_GGML_USE_METAL)
3472
- buft = lm_ggml_backend_metal_buffer_type();
3473
- #elif defined(LM_GGML_USE_CUDA)
3474
- buft = lm_ggml_backend_cuda_buffer_type(local_gpu);
3475
- #elif defined(LM_GGML_USE_VULKAN)
3476
- buft = lm_ggml_backend_vk_buffer_type(local_gpu);
3482
+
3483
+ if (device < (int)model.devices.size()) {
3484
+ return lm_ggml_backend_dev_buffer_type(model.devices[device]);
3485
+ }
3486
+ device -= (int)model.devices.size();
3487
+
3488
+ #if defined(LM_GGML_USE_VULKAN)
3489
+ buft = lm_ggml_backend_vk_buffer_type(device);
3477
3490
  #elif defined(LM_GGML_USE_SYCL)
3478
- buft = lm_ggml_backend_sycl_buffer_type(local_gpu);
3491
+ buft = lm_ggml_backend_sycl_buffer_type(device);
3479
3492
  #elif defined(LM_GGML_USE_KOMPUTE)
3480
- buft = lm_ggml_backend_kompute_buffer_type(local_gpu);
3481
- if (buft == nullptr) {
3482
- LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu);
3483
- }
3493
+ buft = lm_ggml_backend_kompute_buffer_type(device);
3484
3494
  #elif defined(LM_GGML_USE_CANN)
3485
- buft = lm_ggml_backend_cann_buffer_type(local_gpu);
3495
+ buft = lm_ggml_backend_cann_buffer_type(device);
3486
3496
  #endif
3487
3497
 
3488
3498
  if (buft == nullptr) {
3489
- buft = llama_default_buffer_type_cpu(true);
3499
+ buft = llama_default_buffer_type_cpu(model, true);
3490
3500
  }
3491
3501
  return buft;
3502
+
3492
3503
  LM_GGML_UNUSED(model);
3493
- LM_GGML_UNUSED(local_gpu);
3494
3504
  }
3495
3505
 
3496
3506
  static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
3497
3507
  lm_ggml_backend_buffer_type_t buft = nullptr;
3498
3508
 
3499
- #ifdef LM_GGML_USE_CUDA
3500
- if (lm_ggml_backend_cuda_get_device_count() > 1) {
3501
- buft = lm_ggml_backend_cuda_split_buffer_type(tensor_split);
3509
+ // find a backend that supports split buffers
3510
+ for (size_t i = 0; i < lm_ggml_backend_reg_count(); ++i) {
3511
+ lm_ggml_backend_reg_t reg = lm_ggml_backend_reg_get(i);
3512
+
3513
+ auto lm_ggml_backend_split_buffer_type_fn = (lm_ggml_backend_split_buffer_type_t) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_split_buffer_type");
3514
+ if (lm_ggml_backend_split_buffer_type_fn) {
3515
+ buft = lm_ggml_backend_split_buffer_type_fn(tensor_split);
3516
+ if (buft != nullptr) {
3517
+ break;
3518
+ }
3519
+ }
3502
3520
  }
3503
- #endif
3504
3521
 
3505
3522
  #ifdef LM_GGML_USE_SYCL
3506
3523
  if (lm_ggml_backend_sycl_get_device_count() > 1) {
@@ -3517,13 +3534,8 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama
3517
3534
  }
3518
3535
 
3519
3536
  static size_t llama_get_device_memory(const llama_model & model, int device) {
3520
- #ifdef LM_GGML_USE_RPC
3521
- int rpc_count = (int)model.rpc_servers.size();
3522
- #else
3523
- int rpc_count = 0;
3524
- #endif
3525
- int local_device = device - rpc_count;
3526
3537
  #if defined(LM_GGML_USE_RPC)
3538
+ int rpc_count = (int)model.rpc_servers.size();
3527
3539
  if (device < rpc_count) {
3528
3540
  size_t total;
3529
3541
  size_t free;
@@ -3531,32 +3543,37 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
3531
3543
  lm_ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
3532
3544
  return free;
3533
3545
  }
3546
+ device = device - rpc_count;
3534
3547
  #endif
3535
- #if defined(LM_GGML_USE_CUDA)
3536
- size_t total;
3537
- size_t free;
3538
- lm_ggml_backend_cuda_get_device_memory(local_device, &free, &total);
3539
- return free;
3540
- #elif defined(LM_GGML_USE_SYCL)
3548
+
3549
+ if (device < (int)model.devices.size()) {
3550
+ lm_ggml_backend_dev_t dev = model.devices[device];
3551
+ size_t total;
3552
+ size_t free;
3553
+ lm_ggml_backend_dev_memory(dev, &free, &total);
3554
+ return free;
3555
+ }
3556
+
3557
+ #if defined(LM_GGML_USE_SYCL)
3541
3558
  size_t total;
3542
3559
  size_t free;
3543
- lm_ggml_backend_sycl_get_device_memory(local_device, &free, &total);
3560
+ lm_ggml_backend_sycl_get_device_memory(device, &free, &total);
3544
3561
  return free;
3545
3562
  #elif defined(LM_GGML_USE_VULKAN)
3546
3563
  size_t total;
3547
3564
  size_t free;
3548
- lm_ggml_backend_vk_get_device_memory(local_device, &free, &total);
3565
+ lm_ggml_backend_vk_get_device_memory(device, &free, &total);
3549
3566
  return free;
3550
3567
  #elif defined(LM_GGML_USE_CANN)
3551
3568
  size_t total;
3552
3569
  size_t free;
3553
- lm_ggml_backend_cann_get_device_memory(local_device, &free, &total);
3570
+ lm_ggml_backend_cann_get_device_memory(device, &free, &total);
3554
3571
  return free;
3555
3572
  #else
3556
3573
  return 1;
3557
3574
  #endif
3558
3575
  LM_GGML_UNUSED(model);
3559
- LM_GGML_UNUSED(local_device);
3576
+ LM_GGML_UNUSED(device);
3560
3577
  }
3561
3578
 
3562
3579
  //
@@ -3599,7 +3616,7 @@ static bool llama_kv_cache_init(
3599
3616
  buft_layer_count[model.buft_layer[i].buft]++;
3600
3617
  }
3601
3618
  } else {
3602
- buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
3619
+ buft_layer_count[llama_default_buffer_type_cpu(model, true)] = n_layer;
3603
3620
  }
3604
3621
 
3605
3622
  // create a context for each buffer type
@@ -4891,7 +4908,7 @@ struct llama_model_loader {
4891
4908
  static const int TENSOR_NOT_REQUIRED = 1;
4892
4909
  static const int TENSOR_DUPLICATED = 2;
4893
4910
 
4894
- struct lm_ggml_tensor * create_tensor(struct lm_ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
4911
+ struct lm_ggml_tensor * create_tensor(struct lm_ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0) {
4895
4912
  const struct lm_ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
4896
4913
 
4897
4914
  if (cur == NULL) {
@@ -4901,7 +4918,7 @@ struct llama_model_loader {
4901
4918
  return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
4902
4919
  }
4903
4920
 
4904
- struct lm_ggml_tensor * create_tensor_as_view(struct lm_ggml_context * ctx, struct lm_ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
4921
+ struct lm_ggml_tensor * create_tensor_as_view(struct lm_ggml_context * ctx, struct lm_ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true) {
4905
4922
  const struct lm_ggml_tensor * cur = check_tensor_dims(name, ne, required);
4906
4923
 
4907
4924
  if (cur == NULL) {
@@ -4914,7 +4931,7 @@ struct llama_model_loader {
4914
4931
 
4915
4932
  std::array<int64_t, LM_GGML_MAX_DIMS> dims;
4916
4933
  for (size_t i = 0; i < LM_GGML_MAX_DIMS; ++i) {
4917
- dims[i] = i < ne.size() ? ne[i] : 1;
4934
+ dims[i] = i < ne.size() ? ne.begin()[i] : 1;
4918
4935
  }
4919
4936
 
4920
4937
  struct lm_ggml_tensor * tensor = lm_ggml_view_4d(ctx, base,
@@ -5012,7 +5029,7 @@ struct llama_model_loader {
5012
5029
  // Returns false if cancelled by progress_callback
5013
5030
  bool load_all_data(
5014
5031
  struct lm_ggml_context * ctx,
5015
- llama_buf_map & bufs_mmap,
5032
+ llama_buf_map & bufs,
5016
5033
  llama_mlocks * lmlocks,
5017
5034
  llama_progress_callback progress_callback,
5018
5035
  void * progress_callback_user_data) {
@@ -5021,43 +5038,94 @@ struct llama_model_loader {
5021
5038
  std::vector<no_init<uint8_t>> read_buf;
5022
5039
  std::vector<std::future<std::pair<lm_ggml_tensor *, bool>>> validation_result;
5023
5040
 
5024
- #if defined(LM_GGML_USE_CUDA)
5025
5041
  // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
5026
5042
  // NVMe raid configurations might require more / larger buffers.
5027
5043
  constexpr size_t n_buffers = 4;
5028
5044
  constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
5029
5045
 
5030
5046
  std::vector<lm_ggml_backend_buffer_t> host_buffers;
5031
- std::vector<void*> host_ptrs;
5032
5047
  std::vector<lm_ggml_backend_event_t> events;
5048
+ std::vector<void *> host_ptrs;
5033
5049
  size_t buffer_idx = 0; // buffer to use for async loads
5034
-
5035
- lm_ggml_backend_t cuda_backend = nullptr;
5036
- if (!use_mmap && !check_tensors) {
5050
+ lm_ggml_backend_t upload_backend = [&](const char * fn) -> lm_ggml_backend_t {
5051
+ if (use_mmap || check_tensors) {
5052
+ return nullptr;
5053
+ }
5037
5054
  // When not using mmaped io use async uploads from pinned memory to GPU memory.
5038
- // First determine if the CUDA backend is active, and if so, determine the device ID.
5039
- lm_ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr;
5040
- if (buf) {
5041
- lm_ggml_backend_buffer_type_t buffer_type = lm_ggml_backend_buffer_get_type(buf);
5042
- for (int i = 0; i < lm_ggml_backend_cuda_get_device_count(); ++i) {
5043
- auto * cuda_buffer_type = lm_ggml_backend_cuda_buffer_type(i);
5044
- if (buffer_type == cuda_buffer_type) {
5045
- cuda_backend = lm_ggml_backend_cuda_init(i);
5046
- break;
5047
- }
5048
- }
5055
+ // First determine if the backend supports the necessary features for async uploads.
5056
+ auto * buf = bufs.count(0) ? bufs.at(0) : nullptr;
5057
+ if (!buf) {
5058
+ LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", fn);
5059
+ return nullptr;
5060
+ }
5061
+
5062
+ auto * buft = lm_ggml_backend_buffer_get_type(buf);
5063
+ auto * dev = lm_ggml_backend_buft_get_device(buft);
5064
+ if (!dev) {
5065
+ LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", fn,
5066
+ lm_ggml_backend_buft_name(buft));
5067
+ return nullptr;
5068
+ }
5069
+
5070
+ if (buft != lm_ggml_backend_dev_buffer_type(dev)) {
5071
+ LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", fn,
5072
+ lm_ggml_backend_buft_name(buft), lm_ggml_backend_dev_name(dev));
5073
+ return nullptr;
5074
+ }
5075
+
5076
+ lm_ggml_backend_dev_props props;
5077
+ lm_ggml_backend_dev_get_props(dev, &props);
5078
+ if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) {
5079
+ LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", fn,
5080
+ lm_ggml_backend_dev_name(dev));
5081
+ return nullptr;
5049
5082
  }
5050
5083
 
5051
- // If the cuda backend is active create pinned memory buffers and events for synchronisation.
5052
- if (cuda_backend) {
5053
- for (size_t idx = 0; idx < n_buffers; ++idx) {
5054
- host_buffers.emplace_back(lm_ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
5055
- host_ptrs.emplace_back(lm_ggml_backend_buffer_get_base(host_buffers[idx]));
5056
- events.emplace_back(lm_ggml_backend_event_new(cuda_backend));
5084
+ auto * host_buft = lm_ggml_backend_dev_host_buffer_type(dev);
5085
+ if (!host_buft) {
5086
+ LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", fn,
5087
+ lm_ggml_backend_dev_name(dev));
5088
+ return nullptr;
5089
+ }
5090
+
5091
+ // If the backend is supported, create pinned memory buffers and events for synchronisation.
5092
+ for (size_t idx = 0; idx < n_buffers; ++idx) {
5093
+ auto * buf = lm_ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
5094
+ if (!buf) {
5095
+ LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", fn,
5096
+ lm_ggml_backend_dev_name(dev));
5097
+ return nullptr;
5098
+ }
5099
+
5100
+ host_buffers.emplace_back(buf);
5101
+ host_ptrs.emplace_back(lm_ggml_backend_buffer_get_base(buf));
5102
+
5103
+ auto * event = lm_ggml_backend_event_new(dev);
5104
+ if (!event) {
5105
+ LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", fn,
5106
+ lm_ggml_backend_dev_name(dev));
5107
+ return nullptr;
5057
5108
  }
5109
+
5110
+ events.emplace_back(event);
5111
+ }
5112
+
5113
+ lm_ggml_backend_t backend = lm_ggml_backend_dev_init(dev, nullptr);
5114
+ if (!backend) {
5115
+ LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", fn,
5116
+ lm_ggml_backend_dev_name(dev));
5117
+ return nullptr;
5058
5118
  }
5119
+
5120
+ return backend;
5121
+ }(__func__);
5122
+
5123
+ if (upload_backend) {
5124
+ LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__,
5125
+ lm_ggml_backend_dev_name(lm_ggml_backend_get_device(upload_backend)),
5126
+ lm_ggml_backend_buft_name(lm_ggml_backend_buffer_get_type(bufs.at(0))),
5127
+ lm_ggml_backend_name(upload_backend));
5059
5128
  }
5060
- #endif
5061
5129
 
5062
5130
  for (struct lm_ggml_tensor * cur = lm_ggml_get_first_tensor(ctx); cur != NULL; cur = lm_ggml_get_next_tensor(ctx, cur)) {
5063
5131
  const auto * weight = get_weight(lm_ggml_get_name(cur));
@@ -5077,8 +5145,8 @@ struct llama_model_loader {
5077
5145
  if (use_mmap) {
5078
5146
  const auto & mapping = mappings.at(weight->idx);
5079
5147
  lm_ggml_backend_buffer_t buf_mmap = nullptr;
5080
- if (bufs_mmap.count(weight->idx)) {
5081
- buf_mmap = bufs_mmap.at(weight->idx);
5148
+ if (bufs.count(weight->idx)) {
5149
+ buf_mmap = bufs.at(weight->idx);
5082
5150
  }
5083
5151
  uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
5084
5152
 
@@ -5114,9 +5182,8 @@ struct llama_model_loader {
5114
5182
  }));
5115
5183
  }
5116
5184
  } else {
5117
- #if defined(LM_GGML_USE_CUDA)
5118
- // If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
5119
- if (cuda_backend) {
5185
+ // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
5186
+ if (upload_backend) {
5120
5187
  file->seek(weight->offs, SEEK_SET);
5121
5188
 
5122
5189
  size_t bytes_read = 0;
@@ -5126,17 +5193,14 @@ struct llama_model_loader {
5126
5193
 
5127
5194
  lm_ggml_backend_event_synchronize(events[buffer_idx]);
5128
5195
  file->read_raw(host_ptrs[buffer_idx], read_iteration);
5129
- lm_ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
5130
- lm_ggml_backend_event_record(events[buffer_idx]);
5196
+ lm_ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
5197
+ lm_ggml_backend_event_record(events[buffer_idx], upload_backend);
5131
5198
 
5132
5199
  bytes_read += read_iteration;
5133
5200
  ++buffer_idx;
5134
5201
  buffer_idx %= n_buffers;
5135
5202
  }
5136
- }
5137
- else
5138
- #endif
5139
- {
5203
+ } else {
5140
5204
  read_buf.resize(n_size);
5141
5205
  file->seek(weight->offs, SEEK_SET);
5142
5206
  file->read_raw(read_buf.data(), n_size);
@@ -5151,17 +5215,15 @@ struct llama_model_loader {
5151
5215
  size_done += n_size;
5152
5216
  }
5153
5217
 
5154
- #if defined(LM_GGML_USE_CUDA)
5155
- // free temporary resources used for async cuda uploads
5156
- if (cuda_backend) {
5157
- for (size_t idx = 0; idx < n_buffers;++idx) {
5158
- lm_ggml_backend_event_synchronize(events[idx]);
5159
- lm_ggml_backend_event_free(events[idx]);
5160
- lm_ggml_backend_buffer_free(host_buffers[idx]);
5161
- }
5162
- lm_ggml_backend_free(cuda_backend);
5218
+ // free temporary resources used for async uploads
5219
+ for (auto * event : events) {
5220
+ lm_ggml_backend_event_synchronize(event);
5221
+ lm_ggml_backend_event_free(event);
5163
5222
  }
5164
- #endif
5223
+ for (auto * buf : host_buffers) {
5224
+ lm_ggml_backend_buffer_free(buf);
5225
+ }
5226
+ lm_ggml_backend_free(upload_backend);
5165
5227
 
5166
5228
  // check validation results
5167
5229
  bool validation_failed = false;
@@ -5477,8 +5539,10 @@ static void llm_load_hparams(
5477
5539
  }
5478
5540
  } else {
5479
5541
  switch (hparams.n_layer) {
5542
+ case 16: model.type = e_model::MODEL_1B; break; // Llama 3.2 1B
5480
5543
  case 22: model.type = e_model::MODEL_1B; break;
5481
5544
  case 26: model.type = e_model::MODEL_3B; break;
5545
+ case 28: model.type = e_model::MODEL_3B; break; // Llama 3.2 3B
5482
5546
  // granite uses a vocab with len 49152
5483
5547
  case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
5484
5548
  case 36: model.type = e_model::MODEL_8B; break; // granite
@@ -5591,11 +5655,11 @@ static void llm_load_hparams(
5591
5655
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
5592
5656
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
5593
5657
  ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
5594
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
5658
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
5595
5659
  hparams.f_max_alibi_bias = 8.0f;
5596
5660
 
5597
5661
  switch (hparams.n_layer) {
5598
- case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
5662
+ case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
5599
5663
  case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
5600
5664
  }
5601
5665
  } break;
@@ -6095,6 +6159,18 @@ static void llm_load_hparams(
6095
6159
  default: model.type = e_model::MODEL_UNKNOWN;
6096
6160
  }
6097
6161
  } break;
6162
+ case LLM_ARCH_CHAMELEON:
6163
+ {
6164
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
6165
+ hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
6166
+ ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
6167
+
6168
+ switch (hparams.n_layer) {
6169
+ case 32: model.type = e_model::MODEL_7B; break;
6170
+ case 48: model.type = e_model::MODEL_34B; break;
6171
+ default: model.type = e_model::MODEL_UNKNOWN;
6172
+ }
6173
+ } break;
6098
6174
  default: (void)0;
6099
6175
  }
6100
6176
 
@@ -6288,6 +6364,7 @@ static void llm_load_vocab(
6288
6364
  tokenizer_pre == "phi-2" ||
6289
6365
  tokenizer_pre == "jina-es" ||
6290
6366
  tokenizer_pre == "jina-de" ||
6367
+ tokenizer_pre == "jina-v1-en" ||
6291
6368
  tokenizer_pre == "jina-v2-es" ||
6292
6369
  tokenizer_pre == "jina-v2-de" ||
6293
6370
  tokenizer_pre == "jina-v2-code") {
@@ -6352,6 +6429,11 @@ static void llm_load_vocab(
6352
6429
  } else if (
6353
6430
  tokenizer_pre == "exaone") {
6354
6431
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
6432
+ } else if (
6433
+ tokenizer_pre == "chameleon") {
6434
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
6435
+ vocab.tokenizer_add_bos = true;
6436
+ vocab.tokenizer_clean_spaces = false;
6355
6437
  } else {
6356
6438
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
6357
6439
  }
@@ -6409,7 +6491,12 @@ static void llm_load_vocab(
6409
6491
 
6410
6492
  for (uint32_t i = 0; i < n_vocab; i++) {
6411
6493
  std::string word = lm_gguf_get_arr_str(ctx, token_idx, i);
6412
- LM_GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
6494
+
6495
+ //LM_GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
6496
+ if (word.empty()) {
6497
+ LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
6498
+ word = "[EMPTY_" + std::to_string(i) + "]";
6499
+ }
6413
6500
 
6414
6501
  vocab.token_to_id[word] = i;
6415
6502
  vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
@@ -6434,6 +6521,8 @@ static void llm_load_vocab(
6434
6521
  }
6435
6522
  LM_GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
6436
6523
 
6524
+ vocab.init_tokenizer();
6525
+
6437
6526
  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
6438
6527
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
6439
6528
  // For Fill-In-the-Middle (FIM)/infill models which where converted
@@ -6488,8 +6577,14 @@ static void llm_load_vocab(
6488
6577
  vocab.linefeed_id = ids[0];
6489
6578
  } else {
6490
6579
  const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
6491
- LM_GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
6492
- vocab.linefeed_id = ids[0];
6580
+
6581
+ //LM_GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
6582
+ if (ids.empty()) {
6583
+ LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
6584
+ vocab.linefeed_id = vocab.special_pad_id;
6585
+ } else {
6586
+ vocab.linefeed_id = ids[0];
6587
+ }
6493
6588
  }
6494
6589
 
6495
6590
  // special tokens
@@ -6864,6 +6959,13 @@ static bool llm_load_tensors(
6864
6959
  void * progress_callback_user_data) {
6865
6960
  auto & hparams = model.hparams;
6866
6961
 
6962
+ // check if the value of main_gpu is valid
6963
+ if (llama_get_device_count(model) > 0 &&
6964
+ split_mode != LLAMA_SPLIT_MODE_LAYER &&
6965
+ (main_gpu < 0 || main_gpu >= llama_get_device_count(model))) {
6966
+ throw std::runtime_error(format("invalid value for main_gpu: %d (available devices: %d)", main_gpu, llama_get_device_count(model)));
6967
+ }
6968
+
6867
6969
  model.split_mode = split_mode;
6868
6970
  model.main_gpu = main_gpu;
6869
6971
  model.n_gpu_layers = n_gpu_layers;
@@ -6873,14 +6975,14 @@ static bool llm_load_tensors(
6873
6975
  bool use_mmap_buffer = true;
6874
6976
 
6875
6977
  // there is very little benefit to offloading the input layer, so always keep it on the CPU
6876
- model.buft_input = llama_default_buffer_type_cpu(true);
6978
+ model.buft_input = llama_default_buffer_type_cpu(model, true);
6877
6979
  //model.buft_input = llama_default_buffer_type_offload(main_gpu);
6878
6980
 
6879
6981
  model.buft_layer.resize(n_layer);
6880
6982
 
6881
6983
  // assign cpu layers
6882
6984
  for (int i = 0; i < i_gpu_start; ++i) {
6883
- model.buft_layer[i] = llama_default_buffer_type_cpu(true);
6985
+ model.buft_layer[i] = llama_default_buffer_type_cpu(model, true);
6884
6986
  }
6885
6987
 
6886
6988
  if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
@@ -6918,7 +7020,7 @@ static bool llm_load_tensors(
6918
7020
  int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
6919
7021
  model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
6920
7022
  } else {
6921
- model.buft_output = llama_default_buffer_type_cpu(true);
7023
+ model.buft_output = llama_default_buffer_type_cpu(model, true);
6922
7024
  }
6923
7025
  } else {
6924
7026
  lm_ggml_backend_buffer_type_t split_buft;
@@ -6942,7 +7044,7 @@ static bool llm_load_tensors(
6942
7044
  llama_default_buffer_type_offload(model, main_gpu)
6943
7045
  };
6944
7046
  } else {
6945
- model.buft_output = llama_default_buffer_type_cpu(true);
7047
+ model.buft_output = llama_default_buffer_type_cpu(model, true);
6946
7048
  }
6947
7049
  }
6948
7050
 
@@ -7362,6 +7464,12 @@ static bool llm_load_tensors(
7362
7464
 
7363
7465
  if (model.arch == LLM_ARCH_BERT) {
7364
7466
  model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train});
7467
+
7468
+ model.cls = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
7469
+ model.cls_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
7470
+
7471
+ model.cls_out = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
7472
+ model.cls_out_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS_OUT, "bias"), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
7365
7473
  }
7366
7474
 
7367
7475
  model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
@@ -7414,6 +7522,8 @@ static bool llm_load_tensors(
7414
7522
  model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
7415
7523
  model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); //LayerNorm bias
7416
7524
 
7525
+ model.cls = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
7526
+ model.cls_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_CLS, "bias"), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
7417
7527
  for (int i = 0; i < n_layer; ++i) {
7418
7528
  lm_ggml_context * ctx_layer = ctx_for_layer(i);
7419
7529
  lm_ggml_context * ctx_split = ctx_for_layer_split(i);
@@ -8739,6 +8849,45 @@ static bool llm_load_tensors(
8739
8849
  }
8740
8850
 
8741
8851
  } break;
8852
+ case LLM_ARCH_CHAMELEON:
8853
+ {
8854
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
8855
+
8856
+ // output
8857
+ {
8858
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
8859
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
8860
+
8861
+ // if output is NULL, init from the input tok embed
8862
+ if (model.output == NULL) {
8863
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
8864
+ }
8865
+ }
8866
+
8867
+ for (int i = 0; i < n_layer; ++i) {
8868
+ lm_ggml_context * ctx_layer = ctx_for_layer(i);
8869
+ lm_ggml_context * ctx_split = ctx_for_layer_split(i);
8870
+
8871
+ auto & layer = model.layers[i];
8872
+
8873
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
8874
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head});
8875
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv});
8876
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
8877
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
8878
+
8879
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
8880
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
8881
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
8882
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
8883
+
8884
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
8885
+
8886
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
8887
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
8888
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
8889
+ }
8890
+ } break;
8742
8891
  default:
8743
8892
  throw std::runtime_error("unknown architecture");
8744
8893
  }
@@ -8764,55 +8913,40 @@ static bool llm_load_tensors(
8764
8913
  llama_buf_map bufs;
8765
8914
  bufs.reserve(n_max_backend_buffer);
8766
8915
 
8767
- // only the mmap region containing the tensors in the model is mapped to the backend buffer
8768
- // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
8769
- // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
8770
- if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
8916
+ // check if this backend device supports buffer_from_host_ptr
8917
+ // when using a host buffer as the CPU backend buffer, use the CPU device to prioritize using buffer_from_host_ptr over the host buffer
8918
+ lm_ggml_backend_dev_t dev = lm_ggml_backend_buft_get_device(buft == llama_default_buffer_type_cpu(model, true) ? lm_ggml_backend_cpu_buffer_type() : buft);
8919
+ bool buffer_from_host_ptr_supported = false;
8920
+ if (dev) {
8921
+ lm_ggml_backend_dev_props props;
8922
+ lm_ggml_backend_dev_get_props(dev, &props);
8923
+ buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
8924
+ }
8925
+
8926
+ if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported) {
8771
8927
  for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
8928
+ // only the mmap region containing the tensors in the model is mapped to the backend buffer
8929
+ // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
8930
+ // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
8772
8931
  void * addr = nullptr;
8773
- size_t first, last;
8932
+ size_t first, last; // NOLINT
8774
8933
  ml.get_mapping_range(&first, &last, &addr, idx, ctx);
8775
8934
  if (first >= last) {
8776
8935
  continue;
8777
8936
  }
8778
- lm_ggml_backend_buffer_t buf = lm_ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
8937
+ const size_t max_size = lm_ggml_get_max_tensor_size(ctx);
8938
+ lm_ggml_backend_buffer_t buf = lm_ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
8779
8939
  if (buf == nullptr) {
8780
- throw std::runtime_error("unable to allocate backend CPU buffer");
8940
+ throw std::runtime_error(format("unable to allocate %s buffer", lm_ggml_backend_buft_name(buft)));
8781
8941
  }
8782
8942
  model.bufs.push_back(buf);
8783
8943
  bufs.emplace(idx, buf);
8784
- #ifdef LM_GGML_USE_CUDA
8785
- if (n_layer >= n_gpu_layers) {
8786
- lm_ggml_backend_cuda_register_host_buffer(
8787
- lm_ggml_backend_buffer_get_base(buf),
8788
- lm_ggml_backend_buffer_get_size(buf));
8789
- }
8790
- #endif
8791
8944
  }
8792
8945
  }
8793
- #ifdef LM_GGML_USE_METAL
8794
- else if (ml.use_mmap && use_mmap_buffer && buft == lm_ggml_backend_metal_buffer_type()) {
8795
- for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
8796
- const size_t max_size = lm_ggml_get_max_tensor_size(ctx);
8797
- void * addr = nullptr;
8798
- size_t first, last;
8799
- ml.get_mapping_range(&first, &last, &addr, idx, ctx);
8800
- if (first >= last) {
8801
- continue;
8802
- }
8803
- lm_ggml_backend_buffer_t buf = lm_ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
8804
- if (buf == nullptr) {
8805
- throw std::runtime_error("unable to allocate backend metal buffer");
8806
- }
8807
- model.bufs.push_back(buf);
8808
- bufs.emplace(idx, buf);
8809
- }
8810
- }
8811
- #endif
8812
8946
  else {
8813
8947
  lm_ggml_backend_buffer_t buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
8814
8948
  if (buf == nullptr) {
8815
- throw std::runtime_error("unable to allocate backend buffer");
8949
+ throw std::runtime_error(format("unable to allocate %s buffer", lm_ggml_backend_buft_name(buft)));
8816
8950
  }
8817
8951
  model.bufs.push_back(buf);
8818
8952
  if (use_mlock && lm_ggml_backend_buffer_is_host(buf)) {
@@ -10208,6 +10342,10 @@ struct llm_build_context {
10208
10342
  struct lm_ggml_tensor * cur;
10209
10343
 
10210
10344
  switch (pooling_type) {
10345
+ case LLAMA_POOLING_TYPE_NONE:
10346
+ {
10347
+ cur = inp;
10348
+ } break;
10211
10349
  case LLAMA_POOLING_TYPE_MEAN:
10212
10350
  {
10213
10351
  struct lm_ggml_tensor * inp_mean = build_inp_mean();
@@ -10219,9 +10357,26 @@ struct llm_build_context {
10219
10357
  struct lm_ggml_tensor * inp_cls = build_inp_cls();
10220
10358
  cur = lm_ggml_get_rows(ctx0, inp, inp_cls);
10221
10359
  } break;
10222
- case LLAMA_POOLING_TYPE_NONE:
10360
+ case LLAMA_POOLING_TYPE_RANK:
10223
10361
  {
10224
- cur = inp;
10362
+ struct lm_ggml_tensor * inp_cls = build_inp_cls();
10363
+ inp = lm_ggml_get_rows(ctx0, inp, inp_cls);
10364
+
10365
+ // classification head
10366
+ // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
10367
+ LM_GGML_ASSERT(model.cls != nullptr);
10368
+ LM_GGML_ASSERT(model.cls_b != nullptr);
10369
+
10370
+ cur = lm_ggml_add (ctx0, lm_ggml_mul_mat(ctx0, model.cls, inp), model.cls_b);
10371
+ cur = lm_ggml_tanh(ctx0, cur);
10372
+
10373
+ // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
10374
+ // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
10375
+ if (model.cls_out) {
10376
+ LM_GGML_ASSERT(model.cls_out_b != nullptr);
10377
+
10378
+ cur = lm_ggml_add (ctx0, lm_ggml_mul_mat(ctx0, model.cls_out, cur), model.cls_out_b);
10379
+ }
10225
10380
  } break;
10226
10381
  default:
10227
10382
  {
@@ -11450,8 +11605,8 @@ struct llm_build_context {
11450
11605
  inpL = cur;
11451
11606
  }
11452
11607
 
11453
- // final output
11454
11608
  cur = inpL;
11609
+
11455
11610
  cb(cur, "result_embd", -1);
11456
11611
 
11457
11612
  lm_ggml_build_forward_expand(gf, cur);
@@ -15883,6 +16038,184 @@ struct llm_build_context {
15883
16038
 
15884
16039
  return gf;
15885
16040
  }
16041
+
16042
+ // ref: https://github.com/facebookresearch/chameleon
16043
+ // based on the original build_llama() function, changes:
16044
+ // * qk-norm
16045
+ // * swin-norm
16046
+ // * removed bias
16047
+ // * removed MoE
16048
+ struct lm_ggml_cgraph * build_chameleon() {
16049
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
16050
+
16051
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
16052
+ int32_t n_tokens = this->n_tokens;
16053
+
16054
+ const int64_t n_embd_head = hparams.n_embd_head_v;
16055
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
16056
+ LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
16057
+
16058
+ struct lm_ggml_tensor * cur;
16059
+ struct lm_ggml_tensor * inpL;
16060
+
16061
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
16062
+
16063
+ // inp_pos - contains the positions
16064
+ struct lm_ggml_tensor * inp_pos = build_inp_pos();
16065
+
16066
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
16067
+ struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
16068
+
16069
+ for (int il = 0; il < n_layer; ++il) {
16070
+ struct lm_ggml_tensor * inpSA = inpL;
16071
+
16072
+ // norm
16073
+ if (hparams.swin_norm) {
16074
+ cur = inpL;
16075
+ } else {
16076
+ cur = llm_build_norm(ctx0, inpL, hparams,
16077
+ model.layers[il].attn_norm, NULL,
16078
+ LLM_NORM_RMS, cb, il);
16079
+ cb(cur, "attn_norm", il);
16080
+ }
16081
+
16082
+ // self-attention
16083
+ {
16084
+ // compute Q and K and RoPE them
16085
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
16086
+ cb(Qcur, "Qcur", il);
16087
+
16088
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
16089
+ cb(Kcur, "Kcur", il);
16090
+
16091
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
16092
+ cb(Vcur, "Vcur", il);
16093
+
16094
+ if (model.layers[il].attn_q_norm) {
16095
+ Qcur = lm_ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
16096
+ lm_ggml_element_size(Qcur) * n_embd_head,
16097
+ lm_ggml_element_size(Qcur) * n_embd_head * n_head,
16098
+ 0);
16099
+ cb(Qcur, "Qcur", il);
16100
+
16101
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
16102
+ model.layers[il].attn_q_norm,
16103
+ model.layers[il].attn_q_norm_b,
16104
+ LLM_NORM, cb, il);
16105
+ cb(Qcur, "Qcur", il);
16106
+ }
16107
+
16108
+ if (model.layers[il].attn_k_norm) {
16109
+ Kcur = lm_ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
16110
+ lm_ggml_element_size(Kcur) * n_embd_head,
16111
+ lm_ggml_element_size(Kcur) * n_embd_head * n_head_kv,
16112
+ 0);
16113
+ cb(Kcur, "Kcur", il);
16114
+
16115
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
16116
+ model.layers[il].attn_k_norm,
16117
+ model.layers[il].attn_k_norm_b,
16118
+ LLM_NORM, cb, il);
16119
+ cb(Kcur, "Kcur", il);
16120
+ }
16121
+
16122
+ Qcur = lm_ggml_rope_ext(
16123
+ ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
16124
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
16125
+ ext_factor, attn_factor, beta_fast, beta_slow
16126
+ );
16127
+ cb(Qcur, "Qcur", il);
16128
+
16129
+ Kcur = lm_ggml_rope_ext(
16130
+ ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
16131
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
16132
+ ext_factor, attn_factor, beta_fast, beta_slow
16133
+ );
16134
+ cb(Kcur, "Kcur", il);
16135
+
16136
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
16137
+ model.layers[il].wo, nullptr,
16138
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
16139
+
16140
+ if (hparams.swin_norm) {
16141
+ cur = llm_build_norm(ctx0, cur, hparams,
16142
+ model.layers[il].attn_norm, NULL,
16143
+ LLM_NORM_RMS, cb, il);
16144
+ }
16145
+ }
16146
+
16147
+ if (il == n_layer - 1) {
16148
+ // skip computing output for unused tokens
16149
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
16150
+ n_tokens = n_outputs;
16151
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
16152
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
16153
+ }
16154
+
16155
+ struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
16156
+ cb(ffn_inp, "ffn_inp", il);
16157
+
16158
+ // feed-forward network
16159
+ if (!hparams.swin_norm) {
16160
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
16161
+ model.layers[il].ffn_norm, NULL,
16162
+ LLM_NORM_RMS, cb, il);
16163
+ cb(cur, "ffn_norm", il);
16164
+ }
16165
+
16166
+ cur = llm_build_ffn(ctx0, lctx, cur,
16167
+ model.layers[il].ffn_up, NULL, NULL,
16168
+ model.layers[il].ffn_gate, NULL, NULL,
16169
+ model.layers[il].ffn_down, NULL, NULL,
16170
+ NULL,
16171
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
16172
+ cb(cur, "ffn_out", il);
16173
+
16174
+ if (hparams.swin_norm) {
16175
+ cur = llm_build_norm(ctx0, cur, hparams,
16176
+ model.layers[il].ffn_norm, NULL,
16177
+ LLM_NORM_RMS, cb, il);
16178
+ cb(cur, "ffn_norm", il);
16179
+ }
16180
+
16181
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
16182
+ cb(cur, "ffn_out", il);
16183
+
16184
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
16185
+ cb(cur, "l_out", il);
16186
+
16187
+ // input for next layer
16188
+ inpL = cur;
16189
+ }
16190
+
16191
+ cur = inpL;
16192
+
16193
+ cur = llm_build_norm(ctx0, cur, hparams,
16194
+ model.output_norm, NULL,
16195
+ LLM_NORM_RMS, cb, -1);
16196
+ cb(cur, "result_norm", -1);
16197
+
16198
+ // lm_head
16199
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
16200
+ cb(cur, "result_output_with_img_logits", -1);
16201
+
16202
+ // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
16203
+ // Needs to be removed once image outputs are supported.
16204
+ int img_token_end_idx = 8196;
16205
+ int img_token_start_idx = 4;
16206
+ int num_img_tokens = img_token_end_idx - img_token_start_idx;
16207
+ // creates 1d tensor of size num_img_tokens and values -FLT_MAX,
16208
+ // which ensures that text token values are always at least larger than image token values
16209
+ struct lm_ggml_tensor * img_logits = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, num_img_tokens);
16210
+ img_logits = lm_ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
16211
+ cb(img_logits, "img_logits", -1);
16212
+ cur = lm_ggml_set_1d(ctx0, cur, img_logits, lm_ggml_element_size(cur) * img_token_start_idx);
16213
+ cb(cur, "result_output", -1);
16214
+
16215
+ lm_ggml_build_forward_expand(gf, cur);
16216
+
16217
+ return gf;
16218
+ }
15886
16219
  };
15887
16220
 
15888
16221
  static struct lm_ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -16143,6 +16476,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
16143
16476
  {
16144
16477
  result = llm.build_rwkv6();
16145
16478
  } break;
16479
+ case LLM_ARCH_CHAMELEON:
16480
+ {
16481
+ result = llm.build_chameleon();
16482
+ } break;
16146
16483
  default:
16147
16484
  LM_GGML_ABORT("fatal error");
16148
16485
  }
@@ -16429,7 +16766,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
16429
16766
  }
16430
16767
  }
16431
16768
 
16432
- if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
16769
+ if (cparams.embeddings && (
16770
+ cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
16771
+ cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) {
16433
16772
  const int64_t n_tokens = batch.n_tokens;
16434
16773
  const int64_t n_seq_tokens = batch.n_seq_tokens;
16435
16774
  const int64_t n_seqs = batch.n_seqs;
@@ -16444,7 +16783,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
16444
16783
  const llama_seq_id seq_id = batch.seq_id[s][0];
16445
16784
 
16446
16785
  // TODO: adapt limits to n_seqs when batch.equal_seqs is true
16447
- LM_GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
16786
+ LM_GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK");
16448
16787
 
16449
16788
  for (int i = 0; i < n_seq_tokens; ++i) {
16450
16789
  const llama_pos pos = batch.pos[s*n_seq_tokens + i];
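Both hunks above extend the CLS pooling input so LLAMA_POOLING_TYPE_RANK reuses it: for each sequence, the row of its position-0 token is recorded so a single embedding per sequence can be gathered later. A small self-contained sketch of that index bookkeeping, assuming equal-length sequences and toy types in place of llama_ubatch:

    // For each sequence, find the batch row whose position is 0 and remember it;
    // that row index is what the pooling graph later gathers.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        const int64_t n_seqs = 3, n_seq_tokens = 4;
        // pos[s * n_seq_tokens + i] is the position of token i of sequence s
        std::vector<int32_t> pos(n_seqs * n_seq_tokens);
        for (int64_t s = 0; s < n_seqs; ++s)
            for (int64_t i = 0; i < n_seq_tokens; ++i)
                pos[s * n_seq_tokens + i] = (int32_t) i;

        std::vector<uint32_t> cls_rows(n_seqs, 0);
        for (int64_t s = 0; s < n_seqs; ++s) {
            for (int64_t i = 0; i < n_seq_tokens; ++i) {
                if (pos[s * n_seq_tokens + i] == 0) {
                    cls_rows[s] = (uint32_t)(s * n_seq_tokens + i); // row of the first token
                }
            }
        }
        for (int64_t s = 0; s < n_seqs; ++s)
            std::printf("seq %lld -> row %u\n", (long long) s, cls_rows[s]);
        return 0;
    }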
@@ -16646,7 +16985,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
16646
16985
  lctx.embd = nullptr;
16647
16986
  }
16648
16987
 
16649
- lctx.buf_output = lm_ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
16988
+ lctx.buf_output = lm_ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(lctx.model, true), new_size);
16650
16989
  if (lctx.buf_output == nullptr) {
16651
16990
  LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
16652
16991
  return 0;
@@ -16715,24 +17054,20 @@ static void llama_graph_compute(
16715
17054
  lm_ggml_cgraph * gf,
16716
17055
  int n_threads,
16717
17056
  lm_ggml_threadpool * threadpool) {
16718
- #ifdef LM_GGML_USE_METAL
16719
- if (lm_ggml_backend_is_metal(lctx.backend_metal)) {
16720
- lm_ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
16721
- }
16722
- #endif
16723
-
16724
17057
  if (lctx.backend_cpu != nullptr) {
16725
- lm_ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
16726
17058
  lm_ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
16727
17059
  lm_ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
16728
17060
  }
16729
- #ifdef LM_GGML_USE_BLAS
16730
- if (lctx.backend_blas != nullptr) {
16731
- lm_ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
17061
+
17062
+ // set the number of threads for all the backends
17063
+ for (const auto & set_n_threads_fn : lctx.set_n_threads_fns) {
17064
+ set_n_threads_fn.second(set_n_threads_fn.first, n_threads);
16732
17065
  }
16733
- #endif
16734
17066
 
16735
- lm_ggml_backend_sched_graph_compute_async(lctx.sched, gf);
17067
+ auto err = lm_ggml_backend_sched_graph_compute_async(lctx.sched, gf);
17068
+ if (err != LM_GGML_STATUS_SUCCESS) {
17069
+ LLAMA_LOG_ERROR("%s: lm_ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, err);
17070
+ }
16736
17071
 
16737
17072
  // fprintf(stderr, "splits: %d\n", lm_ggml_backend_sched_get_n_splits(lctx.sched));
16738
17073
  }
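The rewritten llama_graph_compute drops the Metal/BLAS #ifdefs and instead walks a list of (backend, set_n_threads) pairs collected at context creation, then checks the status returned by the async scheduler call. A toy sketch of that function-pointer dispatch, with made-up backend types standing in for lm_ggml_backend_t:

    // Store one (backend, hook) pair per backend that supports thread control,
    // then apply the thread count generically, with no per-backend #ifdefs.
    #include <cstdio>
    #include <utility>
    #include <vector>

    struct fake_backend { const char * name; int n_threads; };
    using set_n_threads_t = void (*)(fake_backend *, int);

    static void cpu_set_n_threads(fake_backend * b, int n)  { b->n_threads = n; }
    static void blas_set_n_threads(fake_backend * b, int n) { b->n_threads = n; }

    int main() {
        fake_backend cpu  {"cpu", 1};
        fake_backend blas {"blas", 1};

        // built once at context creation (see the registration loop later in the diff)
        std::vector<std::pair<fake_backend *, set_n_threads_t>> set_n_threads_fns = {
            {&cpu, cpu_set_n_threads}, {&blas, blas_set_n_threads},
        };

        const int n_threads = 8;
        for (const auto & fn : set_n_threads_fns) {
            fn.second(fn.first, n_threads);   // one generic call per backend
        }
        std::printf("%s=%d %s=%d\n", cpu.name, cpu.n_threads, blas.name, blas.n_threads);
        return 0;
    }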
@@ -16984,6 +17319,20 @@ static int llama_decode_internal(
16984
17319
  lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
16985
17320
  }
16986
17321
  } break;
17322
+ case LLAMA_POOLING_TYPE_RANK:
17323
+ {
17324
+ // extract the rerank score - a single float per sequence
17325
+ auto & embd_seq_out = lctx.embd_seq;
17326
+
17327
+ for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
17328
+ const llama_seq_id seq_id = ubatch.seq_id[s][0];
17329
+ if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
17330
+ continue;
17331
+ }
17332
+ embd_seq_out[seq_id].resize(1);
17333
+ lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float));
17334
+ }
17335
+ } break;
16987
17336
  case LLAMA_POOLING_TYPE_UNSPECIFIED:
16988
17337
  {
16989
17338
  LM_GGML_ABORT("unknown pooling type");
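The new LLAMA_POOLING_TYPE_RANK branch above copies exactly one float per sequence out of the pooled tensor, at offset seq_id, instead of a full n_embd row. A plain-buffer sketch of that extraction, with std::vector standing in for the backend tensor read:

    // One rerank score per sequence: resize the per-sequence output to a single
    // float and copy from offset seq_id of the pooled buffer.
    #include <cstdio>
    #include <map>
    #include <vector>

    int main() {
        // pretend this is the pooled "embd" tensor after the rank head: one score per sequence
        std::vector<float> pooled = {0.12f, 0.87f, 0.43f};

        std::map<int, std::vector<float>> embd_seq_out;   // mirrors lctx.embd_seq
        for (int seq_id = 0; seq_id < (int) pooled.size(); ++seq_id) {
            if (embd_seq_out.count(seq_id)) continue;     // already extracted for this sequence
            embd_seq_out[seq_id].resize(1);
            embd_seq_out[seq_id][0] = pooled[seq_id];     // analogous to the 1-float tensor_get_async
        }
        for (const auto & kv : embd_seq_out)
            std::printf("seq %d score %.2f\n", kv.first, kv.second[0]);
        return 0;
    }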
@@ -17190,6 +17539,13 @@ static int llama_encode_internal(
17190
17539
  lm_ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
17191
17540
  }
17192
17541
  } break;
17542
+ case LLAMA_POOLING_TYPE_RANK:
17543
+ {
17544
+ // TODO: this likely should be the same logic as in llama_decode_internal, but better to
17545
+ // wait for an encoder model that requires this pooling type in order to test it
17546
+ // https://github.com/ggerganov/llama.cpp/pull/9510
17547
+ LM_GGML_ABORT("RANK pooling not implemented yet");
17548
+ }
17193
17549
  case LLAMA_POOLING_TYPE_UNSPECIFIED:
17194
17550
  {
17195
17551
  LM_GGML_ABORT("unknown pooling type");
@@ -17527,10 +17883,9 @@ static void llama_tensor_dequantize_internal(
17527
17883
  }
17528
17884
  float * f32_output = (float *) output.data();
17529
17885
 
17530
- lm_ggml_type_traits_t qtype;
17886
+ const lm_ggml_type_traits * qtype = lm_ggml_get_type_traits(tensor->type);
17531
17887
  if (lm_ggml_is_quantized(tensor->type)) {
17532
- qtype = lm_ggml_internal_get_type_traits(tensor->type);
17533
- if (qtype.to_float == NULL) {
17888
+ if (qtype->to_float == NULL) {
17534
17889
  throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", lm_ggml_type_name(tensor->type)));
17535
17890
  }
17536
17891
  } else if (tensor->type != LM_GGML_TYPE_F16 &&
@@ -17544,7 +17899,7 @@ static void llama_tensor_dequantize_internal(
17544
17899
  } else if (tensor->type == LM_GGML_TYPE_BF16) {
17545
17900
  lm_ggml_bf16_to_fp32_row((lm_ggml_bf16_t *)tensor->data, f32_output, nelements);
17546
17901
  } else if (lm_ggml_is_quantized(tensor->type)) {
17547
- qtype.to_float(tensor->data, f32_output, nelements);
17902
+ qtype->to_float(tensor->data, f32_output, nelements);
17548
17903
  } else {
17549
17904
  LM_GGML_ABORT("fatal error"); // unreachable
17550
17905
  }
@@ -17580,7 +17935,7 @@ static void llama_tensor_dequantize_internal(
17580
17935
  } else if (typ == LM_GGML_TYPE_BF16) {
17581
17936
  lm_ggml_bf16_to_fp32_row((lm_ggml_bf16_t *)inbuf, outbuf, nels);
17582
17937
  } else {
17583
- qtype.to_float(inbuf, outbuf, nels);
17938
+ qtype->to_float(inbuf, outbuf, nels);
17584
17939
  }
17585
17940
  };
17586
17941
  workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
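The dequantization path above now holds a pointer returned by lm_ggml_get_type_traits and calls its to_float member, rather than copying a traits struct from the old internal API. A toy version of that table-of-function-pointers lookup (made-up 8-bit format and names, not the real ggml traits):

    // Keep a pointer into a per-type traits table and dispatch through its
    // to_float hook; a null hook means the type cannot be dequantized.
    #include <cstdint>
    #include <cstdio>

    struct type_traits { const char * name; void (*to_float)(const void * in, float * out, int64_t n); };

    static void q8_to_float(const void * in, float * out, int64_t n) {
        const int8_t * q = static_cast<const int8_t *>(in);
        for (int64_t i = 0; i < n; ++i) out[i] = q[i] / 127.0f;   // fixed toy scale
    }

    static const type_traits k_traits[] = {
        { "f32",    nullptr },       // no dequantization needed
        { "q8_toy", q8_to_float },
    };

    int main() {
        const type_traits * qtype = &k_traits[1];   // analogous to lm_ggml_get_type_traits(type)
        if (qtype->to_float == nullptr) { std::puts("type not dequantizable"); return 1; }

        const int8_t data[4] = { -127, 0, 64, 127 };
        float f32[4];
        qtype->to_float(data, f32, 4);
        for (float v : f32) std::printf("%.3f ", v);
        std::puts("");
        return 0;
    }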
@@ -18662,21 +19017,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
18662
19017
  }
18663
19018
 
18664
19019
  size_t llama_max_devices(void) {
18665
- #if defined(LM_GGML_USE_RPC)
18666
- return LM_GGML_RPC_MAX_SERVERS;
18667
- #elif defined(LM_GGML_USE_METAL)
18668
- return 1;
18669
- #elif defined(LM_GGML_USE_CUDA)
18670
- return LM_GGML_CUDA_MAX_DEVICES;
18671
- #elif defined(LM_GGML_USE_SYCL)
18672
- return LM_GGML_SYCL_MAX_DEVICES;
18673
- #elif defined(LM_GGML_USE_VULKAN)
18674
- return LM_GGML_VK_MAX_DEVICES;
18675
- #elif defined(LM_GGML_USE_CANN)
18676
- return LM_GGML_CANN_MAX_DEVICES;
18677
- #else
18678
- return 1;
18679
- #endif
19020
+ return 16;
18680
19021
  }
18681
19022
 
18682
19023
  bool llama_supports_mmap(void) {
@@ -18688,12 +19029,13 @@ bool llama_supports_mlock(void) {
18688
19029
  }
18689
19030
 
18690
19031
  bool llama_supports_gpu_offload(void) {
18691
- #if defined(LM_GGML_USE_CUDA) || defined(LM_GGML_USE_METAL) || defined(LM_GGML_USE_VULKAN) || \
19032
+ #if defined(LM_GGML_USE_VULKAN) || \
18692
19033
  defined(LM_GGML_USE_SYCL) || defined(LM_GGML_USE_KOMPUTE) || defined(LM_GGML_USE_RPC)
18693
19034
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
18694
19035
  return true;
18695
19036
  #else
18696
- return false;
19037
+ return lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
19038
+ lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr;
18697
19039
  #endif
18698
19040
  }
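In the #else branch, llama_supports_gpu_offload() now asks the backend registry at runtime whether any GPU-class device exists instead of returning a hard-coded false. A sketch of that runtime probe with a toy device registry (the real query is lm_ggml_backend_dev_by_type):

    // Runtime capability probe: scan a registry for a GPU-class device and
    // report offload support only if one is present.
    #include <cstdio>
    #include <vector>

    enum class dev_type { cpu, gpu };
    struct device { const char * name; dev_type type; };

    static const device * dev_by_type(const std::vector<device> & regs, dev_type t) {
        for (const auto & d : regs) if (d.type == t) return &d;
        return nullptr;   // mirrors the registry returning nullptr when the type is absent
    }

    int main() {
        std::vector<device> registry = { {"cpu", dev_type::cpu}, {"metal", dev_type::gpu} };
        const bool supports_offload = dev_by_type(registry, dev_type::gpu) != nullptr;
        std::printf("gpu offload: %s\n", supports_offload ? "yes" : "no");
        return 0;
    }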
18699
19041
 
@@ -18758,17 +19100,37 @@ struct llama_model * llama_load_model_from_file(
18758
19100
  return true;
18759
19101
  };
18760
19102
  }
19103
+
18761
19104
  if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
18762
19105
  // split the servers set them into model->rpc_servers
18763
19106
  std::string servers(params.rpc_servers);
18764
19107
  size_t pos = 0;
18765
- while ((pos = servers.find(",")) != std::string::npos) {
19108
+ while ((pos = servers.find(',')) != std::string::npos) {
18766
19109
  std::string server = servers.substr(0, pos);
18767
19110
  model->rpc_servers.push_back(server);
18768
19111
  servers.erase(0, pos + 1);
18769
19112
  }
18770
19113
  model->rpc_servers.push_back(servers);
18771
19114
  }
19115
+
19116
+ // create list of devices to use with this model
19117
+ // currently, we use all available devices
19118
+ // TODO: rework API to give user more control over device selection
19119
+ for (size_t i = 0; i < lm_ggml_backend_dev_count(); ++i) {
19120
+ lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
19121
+ switch (lm_ggml_backend_dev_type(dev)) {
19122
+ case LM_GGML_BACKEND_DEVICE_TYPE_CPU:
19123
+ case LM_GGML_BACKEND_DEVICE_TYPE_CPU_FULL:
19124
+ // skip CPU backends since they are handled separately
19125
+ break;
19126
+
19127
+ case LM_GGML_BACKEND_DEVICE_TYPE_GPU:
19128
+ case LM_GGML_BACKEND_DEVICE_TYPE_GPU_FULL:
19129
+ model->devices.push_back(dev);
19130
+ break;
19131
+ }
19132
+ }
19133
+
18772
19134
  int status = llama_model_load(path_model, *model, params);
18773
19135
  LM_GGML_ASSERT(status <= 0);
18774
19136
  if (status < 0) {
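The rpc_servers handling above splits a comma-separated endpoint list into model->rpc_servers; the only change is using the char overload of find. A standalone sketch of the same parsing, with invented example endpoints:

    // Split "host:port,host:port" into individual endpoints, including the
    // trailing one left after the last comma.
    #include <cstdio>
    #include <string>
    #include <vector>

    static std::vector<std::string> split_servers(std::string servers) {
        std::vector<std::string> out;
        size_t pos = 0;
        while ((pos = servers.find(',')) != std::string::npos) {
            out.push_back(servers.substr(0, pos));
            servers.erase(0, pos + 1);
        }
        out.push_back(servers);   // last (or only) endpoint
        return out;
    }

    int main() {
        for (const auto & s : split_servers("127.0.0.1:50052,192.168.1.10:50052"))
            std::printf("rpc server: %s\n", s.c_str());
        return 0;
    }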
@@ -18930,60 +19292,61 @@ struct llama_context * llama_new_context_with_model(
18930
19292
 
18931
19293
  if (!hparams.vocab_only) {
18932
19294
  // initialize backends
18933
- #if defined(LM_GGML_USE_RPC)
18934
- if (model->n_gpu_layers > 0) {
18935
- for (const auto & endpoint : model->rpc_servers) {
18936
- lm_ggml_backend_t backend = lm_ggml_backend_rpc_init(endpoint.c_str());
19295
+ int main_gpu = model->main_gpu;
19296
+
19297
+ // with registry
19298
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
19299
+ if (main_gpu >= 0 && main_gpu < (int)model->devices.size()) {
19300
+ lm_ggml_backend_dev_t main_dev = model->devices[main_gpu];
19301
+ lm_ggml_backend_t backend = lm_ggml_backend_dev_init(main_dev, nullptr);
18937
19302
  if (backend == nullptr) {
18938
- LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
19303
+ LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, lm_ggml_backend_dev_name(main_dev));
19304
+ llama_free(ctx);
19305
+ return nullptr;
19306
+ }
19307
+ ctx->backends.push_back(backend);
19308
+ }
19309
+ } else {
19310
+ // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
19311
+ for (auto * dev : model->devices) {
19312
+ lm_ggml_backend_t backend = lm_ggml_backend_dev_init(dev, nullptr);
19313
+ if (backend == nullptr) {
19314
+ LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, lm_ggml_backend_dev_name(dev));
18939
19315
  llama_free(ctx);
18940
19316
  return nullptr;
18941
19317
  }
18942
19318
  ctx->backends.push_back(backend);
18943
19319
  }
18944
19320
  }
18945
- #endif
19321
+ if (main_gpu >= (int)model->devices.size()) {
19322
+ main_gpu -= (int)model->devices.size();
19323
+ }
18946
19324
 
18947
- #if defined(LM_GGML_USE_METAL)
19325
+ #if defined(LM_GGML_USE_RPC)
18948
19326
  if (model->n_gpu_layers > 0) {
18949
- ctx->backend_metal = lm_ggml_backend_metal_init();
18950
- if (ctx->backend_metal == nullptr) {
18951
- LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
18952
- llama_free(ctx);
18953
- return nullptr;
18954
- }
18955
- ctx->backends.push_back(ctx->backend_metal);
18956
- }
18957
- #elif defined(LM_GGML_USE_CUDA)
18958
- if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
18959
- // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
18960
- lm_ggml_backend_t backend = lm_ggml_backend_cuda_init(model->main_gpu);
18961
- if (backend == nullptr) {
18962
- LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
18963
- llama_free(ctx);
18964
- return nullptr;
18965
- }
18966
- ctx->backends.push_back(backend);
18967
- } else {
18968
- // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
18969
- for (int device = 0; device < lm_ggml_backend_cuda_get_device_count(); ++device) {
18970
- lm_ggml_backend_t backend = lm_ggml_backend_cuda_init(device);
19327
+ for (const auto & endpoint : model->rpc_servers) {
19328
+ lm_ggml_backend_t backend = lm_ggml_backend_rpc_init(endpoint.c_str());
18971
19329
  if (backend == nullptr) {
18972
- LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
19330
+ LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
18973
19331
  llama_free(ctx);
18974
19332
  return nullptr;
18975
19333
  }
18976
19334
  ctx->backends.push_back(backend);
18977
19335
  }
18978
19336
  }
18979
- #elif defined(LM_GGML_USE_VULKAN)
19337
+ if (main_gpu >= (int)model->rpc_servers.size()) {
19338
+ main_gpu -= (int)model->rpc_servers.size();
19339
+ }
19340
+ #endif
19341
+
19342
+ #if defined(LM_GGML_USE_VULKAN)
18980
19343
  if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
18981
19344
  LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
18982
19345
  llama_free(ctx);
18983
19346
  return nullptr;
18984
19347
  }
18985
19348
  if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
18986
- lm_ggml_backend_t backend = lm_ggml_backend_vk_init(model->main_gpu);
19349
+ lm_ggml_backend_t backend = lm_ggml_backend_vk_init(main_gpu);
18987
19350
  if (backend == nullptr) {
18988
19351
  LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
18989
19352
  llama_free(ctx);
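The backend initialization above switches to the device registry: split modes NONE and ROW initialize only the main device, LAYER initializes one backend per device, and main_gpu is then re-based past the local devices so the same index can still select an RPC server. A toy walkthrough of that selection and re-basing logic (strings stand in for backends, error handling omitted):

    // Pick which devices get a backend, then shift main_gpu so an index beyond
    // the local devices falls into the RPC server list.
    #include <cstdio>
    #include <string>
    #include <vector>

    enum class split_mode { none, row, layer };

    int main() {
        std::vector<std::string> devices = { "gpu0", "gpu1" };   // registry GPU devices
        std::vector<std::string> rpc     = { "host:50052" };     // optional RPC endpoints (invented)
        split_mode mode = split_mode::layer;
        int main_gpu = 2;                                        // points past the local GPUs

        std::vector<std::string> backends;
        if (mode == split_mode::none || mode == split_mode::row) {
            if (main_gpu >= 0 && main_gpu < (int) devices.size())
                backends.push_back(devices[main_gpu]);            // only the main device
        } else {
            for (const auto & d : devices) backends.push_back(d); // one backend per device
        }
        if (main_gpu >= (int) devices.size()) main_gpu -= (int) devices.size();  // now indexes rpc[]

        for (const auto & e : rpc) backends.push_back("rpc:" + e);

        std::printf("main_gpu after re-basing: %d\n", main_gpu);
        for (const auto & b : backends) std::printf("backend: %s\n", b.c_str());
        return 0;
    }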
@@ -19004,9 +19367,9 @@ struct llama_context * llama_new_context_with_model(
19004
19367
  #elif defined(LM_GGML_USE_SYCL)
19005
19368
  // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
19006
19369
  if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
19007
- lm_ggml_backend_t backend = lm_ggml_backend_sycl_init(model->main_gpu);
19370
+ lm_ggml_backend_t backend = lm_ggml_backend_sycl_init(main_gpu);
19008
19371
  if (backend == nullptr) {
19009
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
19372
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, main_gpu);
19010
19373
  llama_free(ctx);
19011
19374
  return nullptr;
19012
19375
  }
@@ -19025,7 +19388,7 @@ struct llama_context * llama_new_context_with_model(
19025
19388
  }
19026
19389
  #elif defined(LM_GGML_USE_KOMPUTE)
19027
19390
  if (model->n_gpu_layers > 0) {
19028
- auto * backend = lm_ggml_backend_kompute_init(model->main_gpu);
19391
+ auto * backend = lm_ggml_backend_kompute_init(main_gpu);
19029
19392
  if (backend == nullptr) {
19030
19393
  LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
19031
19394
  llama_free(ctx);
@@ -19034,39 +19397,44 @@ struct llama_context * llama_new_context_with_model(
19034
19397
  ctx->backends.push_back(backend);
19035
19398
  }
19036
19399
  #elif defined(LM_GGML_USE_CANN)
19037
- // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
19038
- // TODO: lm_ggml_backend_cann is not support split tensor now, just leave code here.
19039
- if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
19040
- lm_ggml_backend_t backend = lm_ggml_backend_cann_init(model->main_gpu);
19041
- if (backend == nullptr) {
19042
- LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu);
19043
- llama_free(ctx);
19044
- return nullptr;
19045
- }
19046
- ctx->backends.push_back(backend);
19047
- } else {
19048
- // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
19049
- // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
19050
- for (int32_t device = 0; device < lm_ggml_backend_cann_get_device_count(); ++device) {
19051
- lm_ggml_backend_t backend = lm_ggml_backend_cann_init(device);
19400
+ // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
19401
+ // TODO: lm_ggml_backend_cann does not support split tensors yet; just leave the code here.
19402
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
19403
+ lm_ggml_backend_t backend = lm_ggml_backend_cann_init(main_gpu);
19052
19404
  if (backend == nullptr) {
19053
- LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
19405
+ LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, main_gpu);
19054
19406
  llama_free(ctx);
19055
19407
  return nullptr;
19056
19408
  }
19057
19409
  ctx->backends.push_back(backend);
19410
+ } else {
19411
+ // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
19412
+ // TODO: currently, CANN can't use multiple GPUs; just leave the code here for a future CANN version.
19413
+ for (int32_t device = 0; device < lm_ggml_backend_cann_get_device_count(); ++device) {
19414
+ lm_ggml_backend_t backend = lm_ggml_backend_cann_init(device);
19415
+ if (backend == nullptr) {
19416
+ LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
19417
+ llama_free(ctx);
19418
+ return nullptr;
19419
+ }
19420
+ ctx->backends.push_back(backend);
19421
+ }
19058
19422
  }
19059
- }
19060
19423
  #endif
19061
19424
 
19062
- #ifdef LM_GGML_USE_BLAS
19063
- ctx->backend_blas = lm_ggml_backend_blas_init();
19064
- if (ctx->backend_blas == nullptr) {
19065
- LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
19066
- } else {
19067
- ctx->backends.push_back(ctx->backend_blas);
19425
+ // add other backends (such as BLAS)
19426
+ for (size_t i = 0; i < lm_ggml_backend_dev_count(); ++i) {
19427
+ lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
19428
+ if (lm_ggml_backend_dev_type(dev) == LM_GGML_BACKEND_DEVICE_TYPE_CPU) {
19429
+ lm_ggml_backend_t backend = lm_ggml_backend_dev_init(dev, nullptr);
19430
+ if (backend == nullptr) {
19431
+ LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, lm_ggml_backend_dev_name(dev));
19432
+ llama_free(ctx);
19433
+ return nullptr;
19434
+ }
19435
+ ctx->backends.push_back(backend);
19436
+ }
19068
19437
  }
19069
- #endif
19070
19438
 
19071
19439
  ctx->backend_cpu = lm_ggml_backend_cpu_init();
19072
19440
  if (ctx->backend_cpu == nullptr) {
@@ -19076,6 +19444,18 @@ struct llama_context * llama_new_context_with_model(
19076
19444
  }
19077
19445
  ctx->backends.push_back(ctx->backend_cpu);
19078
19446
 
19447
+ // create a list of the set_n_threads functions in the backends
19448
+ for (auto * backend : ctx->backends) {
19449
+ lm_ggml_backend_dev_t dev = lm_ggml_backend_get_device(backend);
19450
+ lm_ggml_backend_reg_t reg = dev ? lm_ggml_backend_dev_backend_reg(dev) : nullptr;
19451
+ if (reg) {
19452
+ auto lm_ggml_backend_set_n_threads_fn = (lm_ggml_backend_set_n_threads_t) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_set_n_threads");
19453
+ if (lm_ggml_backend_set_n_threads_fn) {
19454
+ ctx->set_n_threads_fns.emplace_back(backend, lm_ggml_backend_set_n_threads_fn);
19455
+ }
19456
+ }
19457
+ }
19458
+
19079
19459
  if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
19080
19460
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
19081
19461
  llama_free(ctx);
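The loop above builds ctx->set_n_threads_fns by asking each backend's registry, by symbol name, for an optional set_n_threads entry point; backends that do not export it are simply skipped. A sketch of that proc-address style registration with a fake registry (the real lookup is lm_ggml_backend_reg_get_proc_address):

    // Query an optional hook by name; only backends whose registry exports it
    // end up in the dispatch list used at compute time.
    #include <cstdio>
    #include <cstring>
    #include <utility>
    #include <vector>

    struct fake_backend { const char * name; int n_threads; };
    using set_n_threads_t = void (*)(fake_backend *, int);

    static void cpu_set_n_threads(fake_backend * b, int n) { b->n_threads = n; }

    // a backend registry that may or may not export the hook
    static void * reg_get_proc_address(const char * backend_name, const char * symbol) {
        if (std::strcmp(symbol, "set_n_threads") == 0 && std::strcmp(backend_name, "cpu") == 0)
            return reinterpret_cast<void *>(&cpu_set_n_threads);
        return nullptr;   // e.g. a GPU backend with a fixed internal thread model
    }

    int main() {
        fake_backend cpu{"cpu", 1}, gpu{"gpu", 0};
        fake_backend * all[] = { &cpu, &gpu };

        std::vector<std::pair<fake_backend *, set_n_threads_t>> set_n_threads_fns;
        for (fake_backend * b : all) {
            auto fn = reinterpret_cast<set_n_threads_t>(reg_get_proc_address(b->name, "set_n_threads"));
            if (fn) set_n_threads_fns.emplace_back(b, fn);        // only register what exists
        }
        std::printf("registered hooks: %zu\n", set_n_threads_fns.size());
        return 0;
    }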
@@ -19121,7 +19501,7 @@ struct llama_context * llama_new_context_with_model(
19121
19501
  for (auto * backend : ctx->backends) {
19122
19502
  if (lm_ggml_backend_is_cpu(backend)) {
19123
19503
  // use host buffers for the CPU backend compute buffer
19124
- backend_buft.push_back(llama_default_buffer_type_cpu(true));
19504
+ backend_buft.push_back(llama_default_buffer_type_cpu(*model, true));
19125
19505
  } else {
19126
19506
  backend_buft.push_back(lm_ggml_backend_get_default_buffer_type(backend));
19127
19507
  }
@@ -19132,17 +19512,37 @@ struct llama_context * llama_new_context_with_model(
19132
19512
  // buffer used to store the computation graph and the tensor meta data
19133
19513
  ctx->buf_compute_meta.resize(lm_ggml_tensor_overhead()*max_nodes + lm_ggml_graph_overhead_custom(max_nodes, false));
19134
19514
 
19515
+ // TODO: move these checks to lm_ggml_backend_sched
19135
19516
  // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
19136
19517
  bool pipeline_parallel =
19137
19518
  llama_get_device_count(*model) > 1 &&
19138
19519
  model->n_gpu_layers > (int)model->hparams.n_layer &&
19139
19520
  model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
19140
19521
  params.offload_kqv;
19141
- #ifndef LM_GGML_USE_CUDA
19142
- // pipeline parallelism requires support for async compute and events
19143
- // currently this is only implemented in the CUDA backend
19144
- pipeline_parallel = false;
19145
- #endif
19522
+
19523
+ // pipeline parallelism requires support for async compute and events in all devices
19524
+ if (pipeline_parallel) {
19525
+ for (auto * backend : ctx->backends) {
19526
+ if (lm_ggml_backend_is_cpu(backend)) {
19527
+ // ignore CPU backend
19528
+ continue;
19529
+ }
19530
+ auto * dev = lm_ggml_backend_get_device(backend);
19531
+ if (!dev) {
19532
+ // backend is using old interface, not supported
19533
+ pipeline_parallel = false;
19534
+ break;
19535
+ }
19536
+ lm_ggml_backend_dev_props props;
19537
+ lm_ggml_backend_dev_get_props(dev, &props);
19538
+ if (!props.caps.async || !props.caps.events) {
19539
+ // device does not support async compute or events
19540
+ pipeline_parallel = false;
19541
+ break;
19542
+ }
19543
+ }
19544
+ }
19545
+
19146
19546
  ctx->sched = lm_ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);
19147
19547
 
19148
19548
  if (pipeline_parallel) {
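The old rule "disable pipeline parallelism unless built with CUDA" is replaced above by a per-device capability check: every non-CPU backend must report async compute and event support in its device properties. A toy version of that gate, with a stand-in for lm_ggml_backend_dev_props:

    // Pipeline parallelism stays enabled only if every non-CPU backend reports
    // both async compute and event support.
    #include <cstdio>
    #include <vector>

    struct dev_props { bool async; bool events; };
    struct backend_info { const char * name; bool is_cpu; dev_props props; };

    int main() {
        std::vector<backend_info> backends = {
            { "cuda0",   false, { true,  true  } },
            { "vulkan0", false, { true,  false } },  // no event support -> disables the feature
            { "cpu",     true,  { false, false } },  // CPU backend is ignored by the check
        };

        bool pipeline_parallel = true;
        for (const auto & b : backends) {
            if (b.is_cpu) continue;
            if (!b.props.async || !b.props.events) { pipeline_parallel = false; break; }
        }
        std::printf("pipeline parallelism: %s\n", pipeline_parallel ? "enabled" : "disabled");
        return 0;
    }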
@@ -19268,6 +19668,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
19268
19668
  case LLM_ARCH_CHATGLM:
19269
19669
  case LLM_ARCH_GRANITE:
19270
19670
  case LLM_ARCH_GRANITE_MOE:
19671
+ case LLM_ARCH_CHAMELEON:
19271
19672
  return LLAMA_ROPE_TYPE_NORM;
19272
19673
 
19273
19674
  // the pairs of head values are offset by n_rot/2
@@ -21446,15 +21847,9 @@ const std::vector<std::pair<std::string, struct lm_ggml_tensor *>> & llama_inter
21446
21847
  }
21447
21848
 
21448
21849
  void llama_log_set(lm_ggml_log_callback log_callback, void * user_data) {
21449
- g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
21450
- g_state.log_callback_user_data = user_data;
21451
- #ifdef LM_GGML_USE_METAL
21452
- lm_ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
21453
- #elif defined(LM_GGML_USE_CUDA)
21454
- lm_ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
21455
- #elif defined(LM_GGML_USE_CANN)
21456
- lm_ggml_backend_cann_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
21457
- #endif
21850
+ lm_ggml_log_set(log_callback, user_data);
21851
+ g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
21852
+ g_logger_state.log_callback_user_data = user_data;
21458
21853
  }
21459
21854
 
21460
21855
  static void llama_log_internal_v(lm_ggml_log_level level, const char * format, va_list args) {
@@ -21463,12 +21858,12 @@ static void llama_log_internal_v(lm_ggml_log_level level, const char * format, v
21463
21858
  char buffer[128];
21464
21859
  int len = vsnprintf(buffer, 128, format, args);
21465
21860
  if (len < 128) {
21466
- g_state.log_callback(level, buffer, g_state.log_callback_user_data);
21861
+ g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
21467
21862
  } else {
21468
21863
  char * buffer2 = new char[len + 1];
21469
21864
  vsnprintf(buffer2, len + 1, format, args_copy);
21470
21865
  buffer2[len] = 0;
21471
- g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
21866
+ g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
21472
21867
  delete[] buffer2;
21473
21868
  }
21474
21869
  va_end(args_copy);
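llama_log_internal_v keeps its two-pass formatting: try a 128-byte stack buffer, and only when vsnprintf reports a longer message allocate len + 1 bytes and format again from a copied va_list before handing the text to the logger callback. A self-contained sketch of that pattern, with a simplified callback in place of g_logger_state:

    // Two-pass formatting: fast path on the stack, heap fallback sized exactly
    // from the length vsnprintf reports, reformatted from a copied va_list.
    #include <cstdarg>
    #include <cstdio>

    using log_callback_t = void (*)(int level, const char * text, void * user_data);

    static void default_cb(int /*level*/, const char * text, void * /*user_data*/) {
        std::fputs(text, stderr);
    }

    static void log_v(log_callback_t cb, int level, const char * fmt, va_list args) {
        va_list args_copy;
        va_copy(args_copy, args);              // vsnprintf may consume args, so keep a copy
        char buffer[128];
        int len = std::vsnprintf(buffer, sizeof(buffer), fmt, args);
        if (len < (int) sizeof(buffer)) {
            cb(level, buffer, nullptr);        // common case: message fits on the stack
        } else {
            char * buffer2 = new char[len + 1];
            std::vsnprintf(buffer2, len + 1, fmt, args_copy);
            cb(level, buffer2, nullptr);
            delete[] buffer2;
        }
        va_end(args_copy);
    }

    static void log_msg(int level, const char * fmt, ...) {
        va_list args;
        va_start(args, fmt);
        log_v(default_cb, level, fmt, args);
        va_end(args);
    }

    int main() {
        log_msg(2, "%s: loaded %d tensors\n", "example", 291);
        return 0;
    }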