llama_cpp 0.12.3 → 0.12.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,6 +11,12 @@
11
11
  # include "ggml-cuda.h"
12
12
  #elif defined(GGML_USE_CLBLAST)
13
13
  # include "ggml-opencl.h"
14
+ #elif defined(GGML_USE_VULKAN)
15
+ # include "ggml-vulkan.h"
16
+ #elif defined(GGML_USE_SYCL)
17
+ # include "ggml-sycl.h"
18
+ #elif defined(GGML_USE_KOMPUTE)
19
+ # include "ggml-kompute.h"
14
20
  #endif
15
21
 
16
22
  #ifdef GGML_USE_METAL
@@ -52,6 +58,7 @@
52
58
  #include <algorithm>
53
59
  #include <array>
54
60
  #include <cassert>
61
+ #include <cfloat>
55
62
  #include <cinttypes>
56
63
  #include <climits>
57
64
  #include <cmath>
@@ -196,10 +203,13 @@ enum llm_arch {
196
203
  LLM_ARCH_PHI2,
197
204
  LLM_ARCH_PLAMO,
198
205
  LLM_ARCH_CODESHELL,
206
+ LLM_ARCH_ORION,
207
+ LLM_ARCH_INTERNLM2,
208
+ LLM_ARCH_MINICPM,
199
209
  LLM_ARCH_UNKNOWN,
200
210
  };
201
211
 
202
- static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
212
+ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
203
213
  { LLM_ARCH_LLAMA, "llama" },
204
214
  { LLM_ARCH_FALCON, "falcon" },
205
215
  { LLM_ARCH_GPT2, "gpt2" },
@@ -217,6 +227,9 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
217
227
  { LLM_ARCH_PHI2, "phi2" },
218
228
  { LLM_ARCH_PLAMO, "plamo" },
219
229
  { LLM_ARCH_CODESHELL, "codeshell" },
230
+ { LLM_ARCH_ORION, "orion" },
231
+ { LLM_ARCH_INTERNLM2, "internlm2" },
232
+ { LLM_ARCH_MINICPM, "minicpm" },
220
233
  };
221
234
 
222
235
  enum llm_kv {
@@ -269,11 +282,12 @@ enum llm_kv {
269
282
  LLM_KV_TOKENIZER_PAD_ID,
270
283
  LLM_KV_TOKENIZER_ADD_BOS,
271
284
  LLM_KV_TOKENIZER_ADD_EOS,
285
+ LLM_KV_TOKENIZER_ADD_PREFIX,
272
286
  LLM_KV_TOKENIZER_HF_JSON,
273
287
  LLM_KV_TOKENIZER_RWKV,
274
288
  };
275
289
 
276
- static std::map<llm_kv, std::string> LLM_KV_NAMES = {
290
+ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
277
291
  { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
278
292
  { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
279
293
  { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
@@ -323,6 +337,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
323
337
  { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
324
338
  { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
325
339
  { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
340
+ { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
326
341
  { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
327
342
  { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
328
343
  };
@@ -333,7 +348,7 @@ struct LLM_KV {
333
348
  llm_arch arch;
334
349
 
// Builds the GGUF key string for `kv`: the LLM_KV_NAMES entry is passed as the
// first argument to ::format (presumably the format string - confirm against
// ::format's declaration) and the architecture name as the second.
// 0.12.5 drops the .c_str() calls because both tables now store const char *.
// NOTE(review): both lookups use std::map::operator[], which default-inserts a
// value-initialized element for a missing key. With const char * values that
// element is a null pointer (it used to be an empty std::string), so a key
// without a table entry would hand nullptr to ::format. The visible tail of
// LLM_ARCH_NAMES has no LLM_ARCH_UNKNOWN entry - verify callers never reach
// here with such a value, or consider .at()/find().
335
350
  std::string operator()(llm_kv kv) const {
336
- return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str());
351
+ return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
337
352
  }
338
353
  };
339
354
 
@@ -641,7 +656,65 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
641
656
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
642
657
  },
643
658
  },
644
-
659
+ {
660
+ LLM_ARCH_ORION,
661
+ {
662
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
663
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
664
+ { LLM_TENSOR_OUTPUT, "output" },
665
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
666
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
667
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
668
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
669
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
670
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
671
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
672
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
673
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
674
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
675
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
676
+ },
677
+ },
678
+ {
679
+ LLM_ARCH_INTERNLM2,
680
+ {
681
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
682
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
683
+ { LLM_TENSOR_OUTPUT, "output" },
684
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
685
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
686
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
687
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
688
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
689
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
690
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
691
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
692
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
693
+ },
694
+ },
695
+ {
696
+ LLM_ARCH_MINICPM,
697
+ {
698
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
699
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
700
+ { LLM_TENSOR_OUTPUT, "output" },
701
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
702
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
703
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
704
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
705
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
706
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
707
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
708
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
709
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
710
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
711
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
712
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
713
+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
714
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
715
+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
716
+ },
717
+ },
645
718
  {
646
719
  LLM_ARCH_UNKNOWN,
647
720
  {
@@ -699,13 +772,13 @@ struct LLM_TN {
699
772
  // gguf helpers
700
773
  //
701
774
 
// Lookup table from LLAMA_ROPE_SCALING_* values to their textual names
// ("none", "linear", "yarn").
// 0.12.5 changes the key type from int8_t to int32_t and the mapped type from
// std::string to const char *; the three entries themselves are unchanged.
// llm_load_print_meta reads this table with .at(), and
// llama_rope_scaling_type_from_string scans it comparing kv.second to a name.
702
- static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
775
+ static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
703
776
  { LLAMA_ROPE_SCALING_NONE, "none" },
704
777
  { LLAMA_ROPE_SCALING_LINEAR, "linear" },
705
778
  { LLAMA_ROPE_SCALING_YARN, "yarn" },
706
779
  };
707
780
 
708
- static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
781
+ static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
709
782
  for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
710
783
  if (kv.second == name) {
711
784
  return kv.first;
@@ -1132,10 +1205,10 @@ struct llama_mlock {
// MLOCK_SUGGESTION: platform-specific hint text, selected by __APPLE__.
// The Apple variant names three vm.* sysctl values plus the rlimit; the other
// variant names only the rlimit.
// 0.12.5 changes only the wording: the limit is now spelled RLIMIT_MEMLOCK
// instead of RLIMIT_MLOCK in both variants. No other token differs.
1132
1205
  #ifdef __APPLE__
1133
1206
  #define MLOCK_SUGGESTION \
1134
1207
  "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
1135
- "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
1208
+ "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
1136
1209
  #else
1137
1210
  #define MLOCK_SUGGESTION \
1138
- "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
1211
+ "Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
1139
1212
  #endif
1140
1213
 
1141
1214
  bool raw_lock(const void * addr, size_t size) const {
@@ -1256,8 +1329,14 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
1256
1329
  if (host_buffer) {
1257
1330
  buft = ggml_backend_cuda_host_buffer_type();
1258
1331
  }
1332
+ #elif defined(GGML_USE_SYCL)
1333
+ buft = ggml_backend_sycl_host_buffer_type();
1259
1334
  #elif defined(GGML_USE_CPU_HBM)
1260
1335
  buft = ggml_backend_cpu_hbm_buffer_type();
1336
+ #elif defined(GGML_USE_VULKAN)
1337
+ if (host_buffer) {
1338
+ buft = ggml_backend_vk_host_buffer_type();
1339
+ }
1261
1340
  #endif
1262
1341
 
1263
1342
  if (buft == nullptr) {
@@ -1275,8 +1354,17 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
1275
1354
  buft = ggml_backend_metal_buffer_type();
1276
1355
  #elif defined(GGML_USE_CUBLAS)
1277
1356
  buft = ggml_backend_cuda_buffer_type(gpu);
1357
+ #elif defined(GGML_USE_VULKAN)
1358
+ buft = ggml_backend_vk_buffer_type(gpu);
1359
+ #elif defined(GGML_USE_SYCL)
1360
+ buft = ggml_backend_sycl_buffer_type(gpu);
1278
1361
  #elif defined(GGML_USE_CLBLAST)
1279
1362
  buft = ggml_backend_opencl_buffer_type();
1363
+ #elif defined(GGML_USE_KOMPUTE)
1364
+ buft = ggml_backend_kompute_buffer_type(gpu);
1365
+ if (buft == nullptr) {
1366
+ LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
1367
+ }
1280
1368
  #endif
1281
1369
 
1282
1370
  if (buft == nullptr) {
@@ -1304,6 +1392,33 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
1304
1392
  GGML_UNUSED(tensor_split);
1305
1393
  }
1306
1394
 
// Added in 0.12.5: backend-independent device count.
// With GGML_USE_CUBLAS or GGML_USE_VULKAN the value comes from that backend's
// *_get_device_count() call; every other build configuration (including the
// SYCL and Kompute builds introduced in this diff) returns 1.
// llm_load_tensors stores the result in an int and uses it to size the
// per-device `splits` vector for LLAMA_SPLIT_LAYER, replacing the former
// CUDA-only call and fixed GGML_CUDA_MAX_DEVICES array.
1395
+ static size_t llama_get_device_count() {
1396
+ #if defined(GGML_USE_CUBLAS)
1397
+ return ggml_backend_cuda_get_device_count();
1398
+ #elif defined(GGML_USE_VULKAN)
1399
+ return ggml_backend_vk_get_device_count();
1400
+ #else
1401
+ return 1;
1402
+ #endif
1403
+ }
1404
+
// Added in 0.12.5: returns the `free` value that the CUDA or Vulkan backend
// writes for `device`; the `total` value is fetched but discarded.
// Units are whatever the backend reports (presumably bytes - confirm in the
// backend headers).
// Builds without CUDA or Vulkan return the constant 1. llm_load_tensors uses
// the result only as a per-device weight that it then sums and normalizes.
// NOTE(review): in the #else branch GGML_UNUSED(device) follows the return
// statement, so it never executes; it presumably exists only to mark the
// parameter as used for the compiler.
1405
+ static size_t llama_get_device_memory(int device) {
1406
+ #if defined(GGML_USE_CUBLAS)
1407
+ size_t total;
1408
+ size_t free;
1409
+ ggml_backend_cuda_get_device_memory(device, &total, &free);
1410
+ return free;
1411
+ #elif defined(GGML_USE_VULKAN)
1412
+ size_t total;
1413
+ size_t free;
1414
+ ggml_backend_vk_get_device_memory(device, &total, &free);
1415
+ return free;
1416
+ #else
1417
+ return 1;
1418
+ GGML_UNUSED(device);
1419
+ #endif
1420
+ }
1421
+
1307
1422
  //
1308
1423
  // globals
1309
1424
  //
@@ -1327,12 +1442,15 @@ enum e_model {
1327
1442
  MODEL_UNKNOWN,
1328
1443
  MODEL_0_5B,
1329
1444
  MODEL_1B,
1445
+ MODEL_2B,
1330
1446
  MODEL_3B,
1331
1447
  MODEL_4B,
1332
1448
  MODEL_7B,
1333
1449
  MODEL_8B,
1334
1450
  MODEL_13B,
1451
+ MODEL_14B,
1335
1452
  MODEL_15B,
1453
+ MODEL_20B,
1336
1454
  MODEL_30B,
1337
1455
  MODEL_34B,
1338
1456
  MODEL_40B,
@@ -1350,6 +1468,7 @@ static const size_t GiB = 1024*MiB;
1350
1468
 
1351
1469
  struct llama_hparams {
1352
1470
  bool vocab_only;
1471
+ bool rope_finetuned;
1353
1472
  uint32_t n_vocab;
1354
1473
  uint32_t n_ctx_train; // context size the model was trained on
1355
1474
  uint32_t n_embd;
@@ -1369,8 +1488,7 @@ struct llama_hparams {
1369
1488
  float rope_freq_base_train;
1370
1489
  float rope_freq_scale_train;
1371
1490
  uint32_t n_yarn_orig_ctx;
1372
- int8_t rope_scaling_type_train : 3;
1373
- bool rope_finetuned : 1;
1491
+ int32_t rope_scaling_type_train;
1374
1492
 
1375
1493
  float f_clamp_kqv;
1376
1494
  float f_max_alibi_bias;
@@ -1574,6 +1692,8 @@ struct llama_vocab {
1574
1692
  id special_suffix_id = 32008;
1575
1693
  id special_eot_id = 32010;
1576
1694
 
1695
+ bool add_space_prefix = true;
1696
+
1577
1697
  int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
1578
1698
  GGML_ASSERT(token_left.find(' ') == std::string::npos);
1579
1699
  GGML_ASSERT(token_left.find('\n') == std::string::npos);
@@ -1670,6 +1790,10 @@ struct llama_context {
1670
1790
  ggml_backend_free(backend);
1671
1791
  }
1672
1792
 
1793
+ #ifdef GGML_USE_VULKAN
1794
+ ggml_vk_free_cpu_assist();
1795
+ #endif
1796
+
1673
1797
  ggml_backend_buffer_free(buf_input);
1674
1798
  ggml_free(ctx_input);
1675
1799
  }
@@ -2323,6 +2447,7 @@ struct llama_model_loader {
2323
2447
  case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
2324
2448
  case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
2325
2449
  case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
2450
+ case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
2326
2451
  default:
2327
2452
  {
2328
2453
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2633,7 +2758,7 @@ struct llama_model_loader {
2633
2758
  // load LLaMA models
2634
2759
  //
2635
2760
 
2636
- static std::string llama_model_arch_name(llm_arch arch) {
2761
+ static const char * llama_model_arch_name(llm_arch arch) {
2637
2762
  auto it = LLM_ARCH_NAMES.find(arch);
2638
2763
  if (it == LLM_ARCH_NAMES.end()) {
2639
2764
  return "unknown";
@@ -2668,9 +2793,10 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
2668
2793
  case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
2669
2794
  case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
2670
2795
  case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
2671
- case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
2796
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
2672
2797
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
2673
2798
  case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
2799
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
2674
2800
 
2675
2801
  default: return "unknown, may not work";
2676
2802
  }
@@ -2679,11 +2805,14 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
2679
2805
  static const char * llama_model_type_name(e_model type) {
2680
2806
  switch (type) {
2681
2807
  case MODEL_1B: return "1B";
2808
+ case MODEL_2B: return "2B";
2682
2809
  case MODEL_3B: return "3B";
2683
2810
  case MODEL_7B: return "7B";
2684
2811
  case MODEL_8B: return "8B";
2685
2812
  case MODEL_13B: return "13B";
2813
+ case MODEL_14B: return "14B";
2686
2814
  case MODEL_15B: return "15B";
2815
+ case MODEL_20B: return "20B";
2687
2816
  case MODEL_30B: return "30B";
2688
2817
  case MODEL_34B: return "34B";
2689
2818
  case MODEL_40B: return "40B";
@@ -2696,6 +2825,14 @@ static const char * llama_model_type_name(e_model type) {
2696
2825
  default: return "?B";
2697
2826
  }
2698
2827
  }
// Added in 0.12.5: display name for a vocabulary type.
// LLAMA_VOCAB_TYPE_SPM -> "SPM", LLAMA_VOCAB_TYPE_BPE -> "BPE", anything else
// -> "unknown". The returned pointers are string literals.
// llm_load_print_meta calls this in place of the earlier inline ternary
// (marked "TODO: fix"), which printed "BPE" for every non-SPM value.
2828
+ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
2829
+ switch (type) {
2830
+ case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2831
+ case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2832
+ default: return "unknown";
2833
+ }
2834
+ }
2835
+
2699
2836
 
2700
2837
  static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
2701
2838
  model.arch = ml.get_arch();
@@ -2808,6 +2945,15 @@ static void llm_load_hparams(
2808
2945
  default: model.type = e_model::MODEL_UNKNOWN;
2809
2946
  }
2810
2947
  } break;
2948
+ case LLM_ARCH_MINICPM:
2949
+ {
2950
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2951
+
2952
+ switch (hparams.n_layer) {
2953
+ case 40: model.type = e_model::MODEL_2B; break;
2954
+ default: model.type = e_model::MODEL_UNKNOWN;
2955
+ }
2956
+ } break;
2811
2957
  case LLM_ARCH_FALCON:
2812
2958
  {
2813
2959
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2950,7 +3096,24 @@ static void llm_load_hparams(
2950
3096
  default: model.type = e_model::MODEL_UNKNOWN;
2951
3097
  }
2952
3098
  } break;
3099
+ case LLM_ARCH_ORION:
3100
+ {
3101
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2953
3102
 
3103
+ switch (hparams.n_layer) {
3104
+ case 40: model.type = e_model::MODEL_14B; break;
3105
+ default: model.type = e_model::MODEL_UNKNOWN;
3106
+ }
3107
+ } break;
3108
+ case LLM_ARCH_INTERNLM2:
3109
+ {
3110
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
3111
+ switch (hparams.n_layer) {
3112
+ case 32: model.type = e_model::MODEL_7B; break;
3113
+ case 48: model.type = e_model::MODEL_20B; break;
3114
+ default: model.type = e_model::MODEL_UNKNOWN;
3115
+ }
3116
+ } break;
2954
3117
  default: (void)0;
2955
3118
  }
2956
3119
 
@@ -3002,6 +3165,11 @@ static void llm_load_vocab(
3002
3165
  vocab.special_unk_id = 0;
3003
3166
  vocab.special_sep_id = -1;
3004
3167
  vocab.special_pad_id = -1;
3168
+
3169
+ const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
3170
+ if (add_space_prefix_keyidx != -1) {
3171
+ vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
3172
+ } // The default value of add_space_prefix is true.
3005
3173
  } else if (tokenizer_name == "gpt2") {
3006
3174
  vocab.type = LLAMA_VOCAB_TYPE_BPE;
3007
3175
 
@@ -3209,12 +3377,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
3209
3377
  const auto & hparams = model.hparams;
3210
3378
  const auto & vocab = model.vocab;
3211
3379
 
3212
- const auto rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
3380
+ const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
3213
3381
 
3214
3382
  // hparams
3215
3383
  LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
3216
- LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
3217
- LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
3384
+ LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch));
3385
+ LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
3218
3386
  LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
3219
3387
  LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
3220
3388
  LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
@@ -3235,7 +3403,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
3235
3403
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
3236
3404
  LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
3237
3405
  LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
3238
- LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
3406
+ LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
3239
3407
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
3240
3408
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
3241
3409
  LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
@@ -3301,22 +3469,18 @@ static bool llm_load_tensors(
3301
3469
  model.buft_layer[i] = llama_default_buffer_type_cpu(true);
3302
3470
  }
3303
3471
 
3304
- #ifdef GGML_USE_CUBLAS
3305
3472
  if (split_mode == LLAMA_SPLIT_LAYER) {
3306
3473
  // calculate the split points
3307
- int device_count = ggml_backend_cuda_get_device_count();
3474
+ int device_count = llama_get_device_count();
3308
3475
  bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
3309
- float splits[GGML_CUDA_MAX_DEVICES];
3476
+ std::vector<float> splits(device_count);
3310
3477
  if (all_zero) {
3311
3478
  // default split, by free memory
3312
3479
  for (int i = 0; i < device_count; ++i) {
3313
- size_t total;
3314
- size_t free;
3315
- ggml_backend_cuda_get_device_memory(i, &total, &free);
3316
- splits[i] = free;
3480
+ splits[i] = llama_get_device_memory(i);
3317
3481
  }
3318
3482
  } else {
3319
- std::copy(tensor_split, tensor_split + device_count, splits);
3483
+ std::copy(tensor_split, tensor_split + device_count, splits.begin());
3320
3484
  }
3321
3485
 
3322
3486
  // sum and normalize the splits to get the split points
@@ -3332,19 +3496,17 @@ static bool llm_load_tensors(
3332
3496
  // assign the repeating layers to the devices according to the splits
3333
3497
  int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
3334
3498
  for (int64_t i = i_gpu_start; i < n_layer; ++i) {
3335
- int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits;
3499
+ int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
3336
3500
  model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
3337
3501
  }
3338
3502
  // assign the output layer
3339
3503
  if (n_gpu_layers > n_layer) {
3340
- int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits;
3504
+ int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
3341
3505
  model.buft_output = llama_default_buffer_type_offload(layer_gpu);
3342
3506
  } else {
3343
3507
  model.buft_output = llama_default_buffer_type_cpu(true);
3344
3508
  }
3345
- } else
3346
- #endif
3347
- {
3509
+ } else {
3348
3510
  ggml_backend_buffer_type_t split_buft;
3349
3511
  if (split_mode == LLAMA_SPLIT_ROW) {
3350
3512
  split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
@@ -3423,13 +3585,16 @@ static bool llm_load_tensors(
3423
3585
  switch (model.arch) {
3424
3586
  case LLM_ARCH_LLAMA:
3425
3587
  case LLM_ARCH_REFACT:
3588
+ case LLM_ARCH_MINICPM:
3426
3589
  {
3427
3590
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3428
3591
 
3429
3592
  // output
3430
3593
  {
3431
3594
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3432
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3595
+ if (model.arch != LLM_ARCH_MINICPM){
3596
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3597
+ }
3433
3598
  }
3434
3599
 
3435
3600
  for (int i = 0; i < n_layer; ++i) {
@@ -3933,6 +4098,65 @@ static bool llm_load_tensors(
3933
4098
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3934
4099
  }
3935
4100
  } break;
4101
+ case LLM_ARCH_ORION:
4102
+ {
4103
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4104
+ {
4105
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4106
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
4107
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
4108
+ }
4109
+ for (int i = 0; i < n_layer; ++i) {
4110
+ ggml_context * ctx_layer = ctx_for_layer(i);
4111
+ ggml_context * ctx_split = ctx_for_layer_split(i);
4112
+
4113
+ auto & layer = model.layers[i];
4114
+
4115
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4116
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
4117
+
4118
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
4119
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
4120
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
4121
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
4122
+
4123
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4124
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
4125
+
4126
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
4127
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4128
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4129
+ }
4130
+ } break;
4131
+ case LLM_ARCH_INTERNLM2:
4132
+ {
4133
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4134
+
4135
+ // output
4136
+ {
4137
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4138
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
4139
+ }
4140
+
4141
+ for (int i = 0; i < n_layer; ++i) {
4142
+ ggml_context * ctx_layer = ctx_for_layer(i);
4143
+ ggml_context * ctx_split = ctx_for_layer_split(i);
4144
+
4145
+ auto & layer = model.layers[i];
4146
+
4147
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4148
+ // layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
4149
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
4150
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
4151
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
4152
+
4153
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
4154
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4155
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
4156
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4157
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4158
+ }
4159
+ } break;
3936
4160
  default:
3937
4161
  throw std::runtime_error("unknown architecture");
3938
4162
  }
@@ -3985,8 +4209,7 @@ static bool llm_load_tensors(
3985
4209
  ctx_bufs.emplace_back(ctx, buf);
3986
4210
  }
3987
4211
 
3988
- // print memory requirements
3989
- {
4212
+ if (llama_supports_gpu_offload()) {
3990
4213
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
3991
4214
 
3992
4215
  LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -3998,10 +4221,11 @@ static bool llm_load_tensors(
3998
4221
  const int max_offloadable_layers = hparams.n_layer + 1;
3999
4222
 
4000
4223
  LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
4224
+ }
4001
4225
 
4002
- for (ggml_backend_buffer_t buf : model.bufs) {
4003
- LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
4004
- }
4226
+ // print memory requirements
4227
+ for (ggml_backend_buffer_t buf : model.bufs) {
4228
+ LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
4005
4229
  }
4006
4230
 
4007
4231
  // populate tensors_by_name
@@ -4029,7 +4253,7 @@ static bool llm_load_tensors(
4029
4253
  }
4030
4254
 
4031
4255
  // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
4032
- static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
4256
+ static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
4033
4257
  try {
4034
4258
  llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
4035
4259
 
@@ -4050,6 +4274,22 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
4050
4274
  return 0;
4051
4275
  }
4052
4276
 
4277
+ #ifdef GGML_USE_KOMPUTE
4278
+ if (params.n_gpu_layers > 0 && (
4279
+ !(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
4280
+ || !(
4281
+ model.ftype == LLAMA_FTYPE_ALL_F32 ||
4282
+ model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
4283
+ model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
4284
+ model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
4285
+ )
4286
+ )) {
4287
+ // TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file
4288
+ LLAMA_LOG_WARN("%s: disabling Kompute due to unsupported model arch or quantization\n", __func__);
4289
+ params.n_gpu_layers = 0;
4290
+ }
4291
+ #endif
4292
+
4053
4293
  if (!llm_load_tensors(
4054
4294
  ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
4055
4295
  params.progress_callback, params.progress_callback_user_data
@@ -6366,69 +6606,455 @@ struct llm_build_context {
6366
6606
 
6367
6607
  return gf;
6368
6608
  }
6369
- };
6370
-
6371
- static struct ggml_cgraph * llama_build_graph(
6372
- llama_context & lctx,
6373
- const llama_batch & batch) {
6374
- const auto & model = lctx.model;
6375
6609
 
6376
- // check if we should build the worst-case graph (for memory measurement)
6377
- const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
6378
-
6379
- // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
6380
- llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
6381
- if (il >= 0) {
6382
- ggml_format_name(cur, "%s-%d", name, il);
6383
- } else {
6384
- ggml_set_name(cur, name);
6385
- }
6610
+ struct ggml_cgraph * build_orion() {
6611
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6386
6612
 
6387
- if (!lctx.cparams.offload_kqv) {
6388
- if (strcmp(name, "kqv_merged_cont") == 0) {
6389
- // all nodes between the KV store and the attention output are run on the CPU
6390
- ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
6391
- }
6392
- }
6393
- };
6613
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6614
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6615
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6394
6616
 
6395
- struct ggml_cgraph * result = NULL;
6617
+ struct ggml_tensor * cur;
6618
+ struct ggml_tensor * inpL;
6396
6619
 
6397
- struct llm_build_context llm(lctx, batch, cb, worst_case);
6620
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6621
+ cb(inpL, "inp_embd", -1);
6398
6622
 
6399
- //
6400
- // set input data
6401
- //
6623
+ // inp_pos - contains the positions
6624
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6625
+ cb(inp_pos, "inp_pos", -1);
6402
6626
 
6403
- if (!ggml_tallocr_is_measure(lctx.alloc)) {
6404
- if (batch.token) {
6405
- const int64_t n_tokens = batch.n_tokens;
6627
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6628
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6629
+ cb(KQ_mask, "KQ_mask", -1);
6406
6630
 
6407
- ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
6631
+ // shift the entire K-cache if needed
6632
+ if (do_rope_shift) {
6633
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6408
6634
  }
6409
6635
 
6410
- if (batch.embd) {
6411
- const int64_t n_embd = llm.n_embd;
6412
- const int64_t n_tokens = batch.n_tokens;
6636
+ for (int il = 0; il < n_layer; ++il) {
6637
+ struct ggml_tensor * inpSA = inpL;
6413
6638
 
6414
- ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
6415
- }
6639
+ // norm
6640
+ cur = llm_build_norm(ctx0, inpL, hparams,
6641
+ model.layers[il].attn_norm, model.layers[il].attn_norm_b,
6642
+ LLM_NORM, cb, il);
6643
+ cb(cur, "attn_norm", il);
6416
6644
 
6417
- if (batch.pos) {
6418
- const int64_t n_tokens = batch.n_tokens;
6645
+ // self-attention
6646
+ {
6647
+ // compute Q and K and RoPE them
6648
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6649
+ cb(Qcur, "Qcur", il);
6650
+ // if (model.layers[il].bq) {
6651
+ // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
6652
+ // cb(Qcur, "Qcur", il);
6653
+ // }
6419
6654
 
6420
- ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
6421
- }
6655
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
6656
+ cb(Kcur, "Kcur", il);
6657
+ // if (model.layers[il].bk) {
6658
+ // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
6659
+ // cb(Kcur, "Kcur", il);
6660
+ // }
6422
6661
 
6423
- {
6424
- const int64_t n_kv = llm.n_kv;
6425
- const int64_t n_tokens = batch.n_tokens;
6662
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
6663
+ cb(Vcur, "Vcur", il);
6664
+ // if (model.layers[il].bv) {
6665
+ // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
6666
+ // cb(Vcur, "Vcur", il);
6667
+ // }
6426
6668
 
6427
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
6428
- float * data = (float *) lctx.inp_KQ_mask->data;
6669
+ Qcur = ggml_rope_custom(
6670
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6671
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6672
+ ext_factor, attn_factor, beta_fast, beta_slow
6673
+ );
6674
+ cb(Qcur, "Qcur", il);
6429
6675
 
6430
- for (int h = 0; h < 1; ++h) {
6431
- for (int j = 0; j < n_tokens; ++j) {
6676
+ Kcur = ggml_rope_custom(
6677
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6678
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6679
+ ext_factor, attn_factor, beta_fast, beta_slow
6680
+ );
6681
+ cb(Kcur, "Kcur", il);
6682
+
6683
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6684
+ model.layers[il].wo, NULL,
6685
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6686
+ cb(cur, "kqv_out", il);
6687
+ }
6688
+
6689
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6690
+ cb(ffn_inp, "ffn_inp", il);
6691
+
6692
+ // feed-forward network
6693
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6694
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
6695
+ LLM_NORM, cb, il);
6696
+ cb(cur, "ffn_norm", il);
6697
+
6698
+ cur = llm_build_ffn(ctx0, cur,
6699
+ model.layers[il].ffn_up, NULL,
6700
+ model.layers[il].ffn_gate, NULL,
6701
+ model.layers[il].ffn_down, NULL,
6702
+ NULL,
6703
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
6704
+ cb(cur, "ffn_out", il);
6705
+
6706
+ cur = ggml_add(ctx0, cur, ffn_inp);
6707
+ cb(cur, "l_out", il);
6708
+
6709
+ // input for next layer
6710
+ inpL = cur;
6711
+ }
6712
+
6713
+ cur = inpL;
6714
+
6715
+ cur = llm_build_norm(ctx0, cur, hparams,
6716
+ model.output_norm, model.output_norm_b,
6717
+ LLM_NORM, cb, -1);
6718
+ cb(cur, "result_norm", -1);
6719
+
6720
+ // lm_head
6721
+ cur = ggml_mul_mat(ctx0, model.output, cur);
6722
+ cb(cur, "result_output", -1);
6723
+
6724
+ ggml_build_forward_expand(gf, cur);
6725
+
6726
+ return gf;
6727
+ }
6728
+
6729
+ struct ggml_cgraph * build_internlm2() {
6730
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6731
+
6732
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6733
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6734
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6735
+
6736
+ struct ggml_tensor * cur;
6737
+ struct ggml_tensor * inpL;
6738
+
6739
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6740
+ cb(inpL, "inp_embd", -1);
6741
+
6742
+ // inp_pos - contains the positions
6743
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6744
+ cb(inp_pos, "inp_pos", -1);
6745
+
6746
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6747
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6748
+ cb(KQ_mask, "KQ_mask", -1);
6749
+
6750
+ // shift the entire K-cache if needed
6751
+ if (do_rope_shift) {
6752
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6753
+ }
6754
+
6755
+ for (int il = 0; il < n_layer; ++il) {
6756
+ struct ggml_tensor * inpSA = inpL;
6757
+
6758
+ // norm
6759
+ cur = llm_build_norm(ctx0, inpL, hparams,
6760
+ model.layers[il].attn_norm, NULL,
6761
+ LLM_NORM_RMS, cb, il);
6762
+ cb(cur, "attn_norm", il);
6763
+
6764
+ // self-attention
6765
+ {
6766
+ // compute Q and K and RoPE them
6767
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6768
+ cb(Qcur, "Qcur", il);
6769
+ if (model.layers[il].bq) {
6770
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
6771
+ cb(Qcur, "Qcur", il);
6772
+ }
6773
+
6774
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
6775
+ cb(Kcur, "Kcur", il);
6776
+ if (model.layers[il].bk) {
6777
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
6778
+ cb(Kcur, "Kcur", il);
6779
+ }
6780
+
6781
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
6782
+ cb(Vcur, "Vcur", il);
6783
+ if (model.layers[il].bv) {
6784
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
6785
+ cb(Vcur, "Vcur", il);
6786
+ }
6787
+
6788
+ Qcur = ggml_rope_custom(
6789
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6790
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
6791
+ ext_factor, attn_factor, beta_fast, beta_slow
6792
+ );
6793
+ cb(Qcur, "Qcur", il);
6794
+
6795
+ Kcur = ggml_rope_custom(
6796
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6797
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
6798
+ ext_factor, attn_factor, beta_fast, beta_slow
6799
+ );
6800
+ cb(Kcur, "Kcur", il);
6801
+
6802
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6803
+ model.layers[il].wo, model.layers[il].bo,
6804
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6805
+ cb(cur, "kqv_out", il);
6806
+ }
6807
+
6808
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6809
+ cb(ffn_inp, "ffn_inp", il);
6810
+
6811
+ // feed-forward network
6812
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6813
+ model.layers[il].ffn_norm, NULL,
6814
+ LLM_NORM_RMS, cb, il);
6815
+ cb(cur, "ffn_norm", il);
6816
+
6817
+ cur = llm_build_ffn(ctx0, cur,
6818
+ model.layers[il].ffn_up, NULL,
6819
+ model.layers[il].ffn_gate, NULL,
6820
+ model.layers[il].ffn_down, NULL,
6821
+ NULL,
6822
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
6823
+ cb(cur, "ffn_out", il);
6824
+
6825
+ cur = ggml_add(ctx0, cur, ffn_inp);
6826
+ cb(cur, "l_out", il);
6827
+
6828
+ // input for next layer
6829
+ inpL = cur;
6830
+ }
6831
+
6832
+ cur = inpL;
6833
+
6834
+ cur = llm_build_norm(ctx0, cur, hparams,
6835
+ model.output_norm, NULL,
6836
+ LLM_NORM_RMS, cb, -1);
6837
+ cb(cur, "result_norm", -1);
6838
+
6839
+ // lm_head
6840
+ cur = ggml_mul_mat(ctx0, model.output, cur);
6841
+ cb(cur, "result_output", -1);
6842
+
6843
+ ggml_build_forward_expand(gf, cur);
6844
+
6845
+ return gf;
6846
+ }
6847
+
6848
+ // ref: https://arxiv.org/abs/2203.03466
6849
+ // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
6850
+ // based on the original build_llama() function
6851
+ struct ggml_cgraph * build_minicpm() {
6852
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6853
+
6854
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6855
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6856
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6857
+
6858
+ const int64_t n_embd = hparams.n_embd;
6859
+ //TODO: if the model varies, these parameters need to be read from the model
6860
+ const int64_t n_embd_base = 256;
6861
+ const float scale_embd = 12.0f;
6862
+ const float scale_depth = 1.4f;
6863
+
6864
+ struct ggml_tensor * cur;
6865
+ struct ggml_tensor * inpL;
6866
+
6867
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6868
+ cb(inpL, "inp_embd", -1);
6869
+
6870
+ // scale the input embeddings
6871
+ inpL = ggml_scale(ctx0, inpL, scale_embd);
6872
+ cb(inpL, "inp_scaled", -1);
6873
+
6874
+ // inp_pos - contains the positions
6875
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6876
+ cb(inp_pos, "inp_pos", -1);
6877
+
6878
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6879
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6880
+ cb(KQ_mask, "KQ_mask", -1);
6881
+
6882
+ // shift the entire K-cache if needed
6883
+ if (do_rope_shift) {
6884
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6885
+ }
6886
+
6887
+ for (int il = 0; il < n_layer; ++il) {
6888
+ struct ggml_tensor * inpSA = inpL;
6889
+
6890
+ // norm
6891
+ cur = llm_build_norm(ctx0, inpL, hparams,
6892
+ model.layers[il].attn_norm, NULL,
6893
+ LLM_NORM_RMS, cb, il);
6894
+ cb(cur, "attn_norm", il);
6895
+
6896
+ // self-attention
6897
+ {
6898
+ // compute Q and K and RoPE them
6899
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6900
+ cb(Qcur, "Qcur", il);
6901
+ if (model.layers[il].bq) {
6902
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
6903
+ cb(Qcur, "Qcur", il);
6904
+ }
6905
+
6906
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
6907
+ cb(Kcur, "Kcur", il);
6908
+ if (model.layers[il].bk) {
6909
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
6910
+ cb(Kcur, "Kcur", il);
6911
+ }
6912
+
6913
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
6914
+ cb(Vcur, "Vcur", il);
6915
+ if (model.layers[il].bv) {
6916
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
6917
+ cb(Vcur, "Vcur", il);
6918
+ }
6919
+
6920
+ Qcur = ggml_rope_custom(
6921
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6922
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
6923
+ ext_factor, attn_factor, beta_fast, beta_slow
6924
+ );
6925
+ cb(Qcur, "Qcur", il);
6926
+
6927
+ Kcur = ggml_rope_custom(
6928
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6929
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
6930
+ ext_factor, attn_factor, beta_fast, beta_slow
6931
+ );
6932
+ cb(Kcur, "Kcur", il);
6933
+
6934
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6935
+ model.layers[il].wo, model.layers[il].bo,
6936
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6937
+ cb(cur, "kqv_out", il);
6938
+ }
6939
+
6940
+ // scale_res - scale the hidden states for residual connection
6941
+ const float scale_res = scale_depth/sqrtf(float(n_layer));
6942
+ cur = ggml_scale(ctx0, cur, scale_res);
6943
+ cb(cur, "hidden_scaled", -1);
6944
+
6945
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6946
+ cb(ffn_inp, "ffn_inp", il);
6947
+
6948
+ // feed-forward network
6949
+ {
6950
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6951
+ model.layers[il].ffn_norm, NULL,
6952
+ LLM_NORM_RMS, cb, il);
6953
+ cb(cur, "ffn_norm", il);
6954
+
6955
+ cur = llm_build_ffn(ctx0, cur,
6956
+ model.layers[il].ffn_up, NULL,
6957
+ model.layers[il].ffn_gate, NULL,
6958
+ model.layers[il].ffn_down, NULL,
6959
+ NULL,
6960
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
6961
+ cb(cur, "ffn_out", il);
6962
+ }
6963
+
6964
+ // scale the hidden states for residual connection
6965
+ cur = ggml_scale(ctx0, cur, scale_res);
6966
+ cb(cur, "hidden_scaled_ffn", -1);
6967
+
6968
+ cur = ggml_add(ctx0, cur, ffn_inp);
6969
+ cb(cur, "l_out", il);
6970
+
6971
+ // input for next layer
6972
+ inpL = cur;
6973
+ }
6974
+
6975
+ cur = inpL;
6976
+
6977
+ cur = llm_build_norm(ctx0, cur, hparams,
6978
+ model.output_norm, NULL,
6979
+ LLM_NORM_RMS, cb, -1);
6980
+ cb(cur, "result_norm", -1);
6981
+
6982
+ // lm_head scaling
6983
+ const float scale_lmhead = float(n_embd_base)/float(n_embd);
6984
+ cur = ggml_scale(ctx0, cur, scale_lmhead);
6985
+ cb(cur, "lmhead_scaling", -1);
6986
+
6987
+ // lm_head
6988
+ cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
6989
+ cb(cur, "result_output", -1);
6990
+
6991
+ ggml_build_forward_expand(gf, cur);
6992
+
6993
+ return gf;
6994
+ }
6995
+ };
6996
+
6997
+ static struct ggml_cgraph * llama_build_graph(
6998
+ llama_context & lctx,
6999
+ const llama_batch & batch) {
7000
+ const auto & model = lctx.model;
7001
+
7002
+ // check if we should build the worst-case graph (for memory measurement)
7003
+ const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
7004
+
7005
+ // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
7006
+ llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
7007
+ if (il >= 0) {
7008
+ ggml_format_name(cur, "%s-%d", name, il);
7009
+ } else {
7010
+ ggml_set_name(cur, name);
7011
+ }
7012
+
7013
+ if (!lctx.cparams.offload_kqv) {
7014
+ if (strcmp(name, "kqv_merged_cont") == 0) {
7015
+ // all nodes between the KV store and the attention output are run on the CPU
7016
+ ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
7017
+ }
7018
+ }
7019
+ };
7020
+
7021
+ struct ggml_cgraph * result = NULL;
7022
+
7023
+ struct llm_build_context llm(lctx, batch, cb, worst_case);
7024
+
7025
+ //
7026
+ // set input data
7027
+ //
7028
+
7029
+ if (!ggml_tallocr_is_measure(lctx.alloc)) {
7030
+ if (batch.token) {
7031
+ const int64_t n_tokens = batch.n_tokens;
7032
+
7033
+ ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
7034
+ }
7035
+
7036
+ if (batch.embd) {
7037
+ const int64_t n_embd = llm.n_embd;
7038
+ const int64_t n_tokens = batch.n_tokens;
7039
+
7040
+ ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
7041
+ }
7042
+
7043
+ if (batch.pos) {
7044
+ const int64_t n_tokens = batch.n_tokens;
7045
+
7046
+ ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
7047
+ }
7048
+
7049
+ {
7050
+ const int64_t n_kv = llm.n_kv;
7051
+ const int64_t n_tokens = batch.n_tokens;
7052
+
7053
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
7054
+ float * data = (float *) lctx.inp_KQ_mask->data;
7055
+
7056
+ for (int h = 0; h < 1; ++h) {
7057
+ for (int j = 0; j < n_tokens; ++j) {
6432
7058
  const llama_pos pos = batch.pos[j];
6433
7059
  const llama_seq_id seq_id = batch.seq_id[j][0];
6434
7060
 
@@ -6520,6 +7146,18 @@ static struct ggml_cgraph * llama_build_graph(
6520
7146
  {
6521
7147
  result = llm.build_codeshell();
6522
7148
  } break;
7149
+ case LLM_ARCH_ORION:
7150
+ {
7151
+ result = llm.build_orion();
7152
+ } break;
7153
+ case LLM_ARCH_INTERNLM2:
7154
+ {
7155
+ result = llm.build_internlm2();
7156
+ } break;
7157
+ case LLM_ARCH_MINICPM:
7158
+ {
7159
+ result = llm.build_minicpm();
7160
+ } break;
6523
7161
  default:
6524
7162
  GGML_ASSERT(false);
6525
7163
  }
@@ -6651,11 +7289,6 @@ static int llama_decode_internal(
6651
7289
  n_threads = std::min(4, n_threads);
6652
7290
  }
6653
7291
 
6654
- const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
6655
- if (ggml_cpu_has_cublas() && fully_offloaded) {
6656
- n_threads = 1;
6657
- }
6658
-
6659
7292
  #ifdef GGML_USE_MPI
6660
7293
  const int64_t n_layer = hparams.n_layer;
6661
7294
  ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
@@ -7467,7 +8100,9 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
7467
8100
  //
7468
8101
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
7469
8102
  if (&fragment == &fragment_buffer.front()) {
7470
- raw_text = " " + raw_text; // prefix with space if the first token is not special
8103
+ if (vocab.add_space_prefix) {
8104
+ raw_text = " " + raw_text; // prefix with space if the first token is not special
8105
+ }
7471
8106
  }
7472
8107
 
7473
8108
  #ifdef PRETOKENIZERDEBUG
@@ -7946,8 +8581,17 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
7946
8581
  }
7947
8582
 
7948
8583
  void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
8584
+ // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
8585
+ // if (k >= (int32_t)candidates->size) {
8586
+ // return;
8587
+ // }
8588
+
7949
8589
  const int64_t t_start_sample_us = ggml_time_us();
7950
8590
 
8591
+ if (k <= 0) {
8592
+ k = candidates->size;
8593
+ }
8594
+
7951
8595
  k = std::max(k, (int) min_keep);
7952
8596
  k = std::min(k, (int) candidates->size);
7953
8597
 
@@ -8054,21 +8698,56 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
8054
8698
  return;
8055
8699
  }
8056
8700
 
8057
- llama_sample_softmax(ctx, candidates);
8058
-
8059
8701
  const int64_t t_start_sample_us = ggml_time_us();
8060
8702
 
8061
- float scale = candidates->data[0].p; // scale by max prob
8062
- size_t i = 1; // first token always matches
8703
+ bool min_p_applied = false;
8704
+
8705
+ // if the candidates aren't sorted, try the unsorted implementation first
8706
+ if (!candidates->sorted) {
8707
+ std::vector<llama_token_data> filtered_tokens;
8708
+
8709
+ float max_logit = -FLT_MAX;
8710
+ for (size_t i = 0; i < candidates->size; ++i) {
8711
+ max_logit = std::max(max_logit, candidates->data[i].logit);
8712
+ }
8713
+ const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
8714
+
8715
+ for (size_t i = 0; i < candidates->size; ++i) {
8716
+ if (candidates->data[i].logit >= min_logit) {
8717
+ filtered_tokens.push_back(candidates->data[i]);
8718
+ }
8719
+ }
8063
8720
 
8064
- for (; i < candidates->size; ++i) {
8065
- if (candidates->data[i].p < p * scale && i >= min_keep) {
8066
- break; // prob too small
8721
+ // if we have enough values the operation was a success
8722
+ if (filtered_tokens.size() >= min_keep) {
8723
+ memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
8724
+ candidates->size = filtered_tokens.size();
8725
+ min_p_applied = true;
8067
8726
  }
8068
8727
  }
8069
8728
 
8070
- // Resize the output vector to keep only the matching tokens
8071
- candidates->size = i;
8729
+ // if the candidates are sorted or the unsorted implementation failed, use this implementation
8730
+ if (!min_p_applied) {
8731
+ // Sort the logits in descending order
8732
+ if (!candidates->sorted) {
8733
+ std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
8734
+ return a.logit > b.logit;
8735
+ });
8736
+ candidates->sorted = true;
8737
+ }
8738
+
8739
+ const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
8740
+ size_t i = 1; // first token always matches
8741
+
8742
+ for (; i < candidates->size; ++i) {
8743
+ if (candidates->data[i].logit < min_logit && i >= min_keep) {
8744
+ break; // prob too small
8745
+ }
8746
+ }
8747
+
8748
+ // Resize the output vector to keep only the matching tokens
8749
+ candidates->size = i;
8750
+ }
8072
8751
 
8073
8752
  if (ctx) {
8074
8753
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
@@ -8972,6 +9651,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8972
9651
  else if (new_type != GGML_TYPE_Q8_0) {
8973
9652
  new_type = GGML_TYPE_Q6_K;
8974
9653
  }
9654
+ } else if (name == "token_embd.weight") {
9655
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
9656
+ new_type = GGML_TYPE_Q2_K;
9657
+ }
9658
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
9659
+ new_type = GGML_TYPE_Q4_K;
9660
+ }
8975
9661
  } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
8976
9662
  if (name.find("attn_v.weight") != std::string::npos) {
8977
9663
  if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
@@ -8982,7 +9668,6 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8982
9668
  if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
8983
9669
  ++qs.i_ffn_down;
8984
9670
  }
8985
- else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
8986
9671
  } else if (name.find("attn_v.weight") != std::string::npos) {
8987
9672
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
8988
9673
  new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
@@ -8990,6 +9675,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8990
9675
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
8991
9676
  new_type = GGML_TYPE_Q4_K;
8992
9677
  }
9678
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
9679
+ new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS;
9680
+ }
8993
9681
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
8994
9682
  new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
8995
9683
  }
@@ -9027,6 +9715,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9027
9715
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
9028
9716
  if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
9029
9717
  }
9718
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
9719
+ new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
9720
+ }
9030
9721
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
9031
9722
  new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
9032
9723
  : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
@@ -9058,13 +9749,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9058
9749
  } else if (name.find("attn_output.weight") != std::string::npos) {
9059
9750
  if (arch != LLM_ARCH_FALCON) {
9060
9751
  if (qs.model.hparams.n_expert == 8) {
9061
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
9752
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
9062
9753
  ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
9063
9754
  ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
9064
9755
  new_type = GGML_TYPE_Q5_K;
9065
9756
  }
9066
9757
  } else {
9067
9758
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
9759
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K;
9068
9760
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
9069
9761
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
9070
9762
  }
@@ -9107,7 +9799,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9107
9799
  bool convert_incompatible_tensor = false;
9108
9800
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
9109
9801
  new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
9110
- new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS) {
9802
+ new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
9803
+ new_type == GGML_TYPE_IQ3_XXS) {
9111
9804
  int nx = tensor->ne[0];
9112
9805
  int ny = tensor->ne[1];
9113
9806
  if (nx % QK_K != 0) {
@@ -9121,6 +9814,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9121
9814
  switch (new_type) {
9122
9815
  case GGML_TYPE_IQ2_XXS:
9123
9816
  case GGML_TYPE_IQ2_XS:
9817
+ case GGML_TYPE_IQ3_XXS:
9124
9818
  case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
9125
9819
  case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
9126
9820
  case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
@@ -9162,6 +9856,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9162
9856
  case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
9163
9857
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
9164
9858
  case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
9859
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
9165
9860
 
9166
9861
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
9167
9862
  }
@@ -9812,18 +10507,47 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
9812
10507
  return result;
9813
10508
  }
9814
10509
 
9815
- int32_t llama_max_devices(void) {
9816
- return LLAMA_MAX_DEVICES;
10510
// Returns the maximum number of devices for the backend selected at compile time.
// Metal builds, and builds without CUDA, SYCL or Vulkan, report a single device.
size_t llama_max_devices(void) {
    size_t max_devices = 1;
#if !defined(GGML_USE_METAL)
    // Metal takes precedence; otherwise the first enabled backend decides
#  if defined(GGML_USE_CUBLAS)
    max_devices = GGML_CUDA_MAX_DEVICES;
#  elif defined(GGML_USE_SYCL)
    max_devices = GGML_SYCL_MAX_DEVICES;
#  elif defined(GGML_USE_VULKAN)
    max_devices = GGML_VK_MAX_DEVICES;
#  endif
#endif
    return max_devices;
}
9818
10523
 
9819
- bool llama_mmap_supported(void) {
10524
+ bool llama_supports_mmap(void) {
9820
10525
  return llama_mmap::SUPPORTED;
9821
10526
  }
9822
10527
 
9823
- bool llama_mlock_supported(void) {
10528
+ bool llama_supports_mlock(void) {
9824
10529
  return llama_mlock::SUPPORTED;
9825
10530
  }
9826
10531
 
10532
// Reports whether this build can offload model layers to a GPU, i.e. whether it was
// compiled with at least one of the GPU backends tested below.
bool llama_supports_gpu_offload(void) {
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
    defined(GGML_USE_SYCL)   || defined(GGML_USE_KOMPUTE)
    // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
    const bool has_gpu_backend = true;
#else
    const bool has_gpu_backend = false;
#endif
    return has_gpu_backend;
}
10541
+
10542
+ // deprecated:
10543
+ bool llama_mmap_supported(void) {
10544
+ return llama_supports_mmap();
10545
+ }
10546
+
10547
+ bool llama_mlock_supported(void) {
10548
+ return llama_supports_mlock();
10549
+ }
10550
+
9827
10551
  void llama_backend_init(bool numa) {
9828
10552
  ggml_time_init();
9829
10553
 
@@ -9855,8 +10579,8 @@ int64_t llama_time_us(void) {
9855
10579
  }
9856
10580
 
9857
10581
  struct llama_model * llama_load_model_from_file(
9858
- const char * path_model,
9859
- struct llama_model_params params) {
10582
+ const char * path_model,
10583
+ struct llama_model_params params) {
9860
10584
  ggml_time_init();
9861
10585
 
9862
10586
  llama_model * model = new llama_model;
@@ -9997,6 +10721,38 @@ struct llama_context * llama_new_context_with_model(
9997
10721
  }
9998
10722
  }
9999
10723
  }
10724
+ #elif defined(GGML_USE_VULKAN)
10725
+ if (model->n_gpu_layers > 0) {
10726
+ for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
10727
+ ggml_backend_t backend = ggml_backend_vk_init(device);
10728
+ if (backend == nullptr) {
10729
+ LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
10730
+ llama_free(ctx);
10731
+ return nullptr;
10732
+ }
10733
+ ctx->backends.push_back(backend);
10734
+ }
10735
+ }
10736
+ #elif defined(GGML_USE_SYCL)
10737
+ if (model->n_gpu_layers > 0) {
10738
+ ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
10739
+ if (backend == nullptr) {
10740
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
10741
+ llama_free(ctx);
10742
+ return nullptr;
10743
+ }
10744
+ ctx->backends.push_back(backend);
10745
+ }
10746
+ #elif defined(GGML_USE_KOMPUTE)
10747
+ if (model->n_gpu_layers > 0) {
10748
+ auto * backend = ggml_backend_kompute_init(model->main_gpu);
10749
+ if (backend == nullptr) {
10750
+ LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
10751
+ llama_free(ctx);
10752
+ return nullptr;
10753
+ }
10754
+ ctx->backends.push_back(backend);
10755
+ }
10000
10756
  #endif
10001
10757
  ctx->backend_cpu = ggml_backend_cpu_init();
10002
10758
  if (ctx->backend_cpu == nullptr) {
@@ -10202,7 +10958,7 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
10202
10958
 
10203
10959
  int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
10204
10960
  return snprintf(buf, buf_size, "%s %s %s",
10205
- llama_model_arch_name(model->arch).c_str(),
10961
+ llama_model_arch_name(model->arch),
10206
10962
  llama_model_type_name(model->type),
10207
10963
  llama_model_ftype_name(model->ftype).c_str());
10208
10964
  }
@@ -10844,22 +11600,24 @@ struct llama_batch llama_batch_get_one(
10844
11600
  };
10845
11601
  }
10846
11602
 
10847
- struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
11603
+ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
10848
11604
  llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
10849
11605
 
10850
11606
  if (embd) {
10851
- batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
11607
+ batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
10852
11608
  } else {
10853
- batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
11609
+ batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
10854
11610
  }
10855
11611
 
10856
- batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
10857
- batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens);
10858
- batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
10859
- for (int i = 0; i < n_tokens; ++i) {
11612
+ batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens_alloc);
11613
+ batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens_alloc);
11614
+ batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
11615
+ for (int i = 0; i < n_tokens_alloc; ++i) {
10860
11616
  batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
10861
11617
  }
10862
- batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
11618
+ batch.seq_id[n_tokens_alloc] = nullptr;
11619
+
11620
+ batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens_alloc);
10863
11621
 
10864
11622
  return batch;
10865
11623
  }
@@ -10870,7 +11628,7 @@ void llama_batch_free(struct llama_batch batch) {
10870
11628
  if (batch.pos) free(batch.pos);
10871
11629
  if (batch.n_seq_id) free(batch.n_seq_id);
10872
11630
  if (batch.seq_id) {
10873
- for (int i = 0; i < batch.n_tokens; ++i) {
11631
+ for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
10874
11632
  free(batch.seq_id[i]);
10875
11633
  }
10876
11634
  free(batch.seq_id);