llama_cpp 0.12.3 → 0.12.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,6 +11,12 @@
  # include "ggml-cuda.h"
  #elif defined(GGML_USE_CLBLAST)
  # include "ggml-opencl.h"
+ #elif defined(GGML_USE_VULKAN)
+ # include "ggml-vulkan.h"
+ #elif defined(GGML_USE_SYCL)
+ # include "ggml-sycl.h"
+ #elif defined(GGML_USE_KOMPUTE)
+ # include "ggml-kompute.h"
  #endif

  #ifdef GGML_USE_METAL
@@ -52,6 +58,7 @@
  #include <algorithm>
  #include <array>
  #include <cassert>
+ #include <cfloat>
  #include <cinttypes>
  #include <climits>
  #include <cmath>
@@ -196,6 +203,8 @@ enum llm_arch {
  LLM_ARCH_PHI2,
  LLM_ARCH_PLAMO,
  LLM_ARCH_CODESHELL,
+ LLM_ARCH_ORION,
+ LLM_ARCH_INTERNLM2,
  LLM_ARCH_UNKNOWN,
  };

@@ -217,6 +226,8 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
  { LLM_ARCH_PHI2, "phi2" },
  { LLM_ARCH_PLAMO, "plamo" },
  { LLM_ARCH_CODESHELL, "codeshell" },
+ { LLM_ARCH_ORION, "orion" },
+ { LLM_ARCH_INTERNLM2, "internlm2" },
  };

  enum llm_kv {
@@ -269,6 +280,7 @@ enum llm_kv {
  LLM_KV_TOKENIZER_PAD_ID,
  LLM_KV_TOKENIZER_ADD_BOS,
  LLM_KV_TOKENIZER_ADD_EOS,
+ LLM_KV_TOKENIZER_ADD_PREFIX,
  LLM_KV_TOKENIZER_HF_JSON,
  LLM_KV_TOKENIZER_RWKV,
  };
@@ -323,6 +335,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
  { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
  { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
+ { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
  { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
  { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
  };
@@ -641,7 +654,42 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
-
+ {
+ LLM_ARCH_ORION,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+ {
+ LLM_ARCH_INTERNLM2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1132,10 +1180,10 @@ struct llama_mlock {
  #ifdef __APPLE__
  #define MLOCK_SUGGESTION \
  "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
- "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
+ "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
  #else
  #define MLOCK_SUGGESTION \
- "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
+ "Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
  #endif

  bool raw_lock(const void * addr, size_t size) const {
@@ -1256,8 +1304,14 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
  if (host_buffer) {
  buft = ggml_backend_cuda_host_buffer_type();
  }
+ #elif defined(GGML_USE_SYCL)
+ buft = ggml_backend_sycl_host_buffer_type();
  #elif defined(GGML_USE_CPU_HBM)
  buft = ggml_backend_cpu_hbm_buffer_type();
+ #elif defined(GGML_USE_VULKAN)
+ if (host_buffer) {
+ buft = ggml_backend_vk_host_buffer_type();
+ }
  #endif

  if (buft == nullptr) {
@@ -1275,8 +1329,17 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
  buft = ggml_backend_metal_buffer_type();
  #elif defined(GGML_USE_CUBLAS)
  buft = ggml_backend_cuda_buffer_type(gpu);
+ #elif defined(GGML_USE_VULKAN)
+ buft = ggml_backend_vk_buffer_type();
+ #elif defined(GGML_USE_SYCL)
+ buft = ggml_backend_sycl_buffer_type(gpu);
  #elif defined(GGML_USE_CLBLAST)
  buft = ggml_backend_opencl_buffer_type();
+ #elif defined(GGML_USE_KOMPUTE)
+ buft = ggml_backend_kompute_buffer_type(gpu);
+ if (buft == nullptr) {
+ LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+ }
  #endif

  if (buft == nullptr) {
@@ -1332,7 +1395,9 @@ enum e_model {
  MODEL_7B,
  MODEL_8B,
  MODEL_13B,
+ MODEL_14B,
  MODEL_15B,
+ MODEL_20B,
  MODEL_30B,
  MODEL_34B,
  MODEL_40B,
@@ -1574,6 +1639,8 @@ struct llama_vocab {
  id special_suffix_id = 32008;
  id special_eot_id = 32010;

+ bool add_space_prefix = true;
+
  int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
  GGML_ASSERT(token_left.find(' ') == std::string::npos);
  GGML_ASSERT(token_left.find('\n') == std::string::npos);
@@ -2323,6 +2390,7 @@ struct llama_model_loader {
  case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
  case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
  case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
+ case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
  default:
  {
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2668,9 +2736,10 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
  case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
  case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
- case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
  case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";

  default: return "unknown, may not work";
  }
@@ -2683,7 +2752,9 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_7B: return "7B";
  case MODEL_8B: return "8B";
  case MODEL_13B: return "13B";
+ case MODEL_14B: return "14B";
  case MODEL_15B: return "15B";
+ case MODEL_20B: return "20B";
  case MODEL_30B: return "30B";
  case MODEL_34B: return "34B";
  case MODEL_40B: return "40B";
@@ -2696,6 +2767,14 @@ static const char * llama_model_type_name(e_model type) {
  default: return "?B";
  }
  }
+ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
+ switch (type) {
+ case LLAMA_VOCAB_TYPE_SPM: return "SPM";
+ case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+ default: return "unknown";
+ }
+ }
+

  static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
  model.arch = ml.get_arch();
@@ -2950,7 +3029,24 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_ORION:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

+ switch (hparams.n_layer) {
+ case 40: model.type = e_model::MODEL_14B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_INTERNLM2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 48: model.type = e_model::MODEL_20B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }

@@ -3002,6 +3098,11 @@ static void llm_load_vocab(
  vocab.special_unk_id = 0;
  vocab.special_sep_id = -1;
  vocab.special_pad_id = -1;
+
+ const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+ if (add_space_prefix_keyidx != -1) {
+ vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+ } // The default value of add_space_prefix is true.
  } else if (tokenizer_name == "gpt2") {
  vocab.type = LLAMA_VOCAB_TYPE_BPE;

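The `tokenizer.ggml.add_space_prefix` key read above is an optional boolean in the GGUF metadata; when it is missing, the loader keeps the previous behaviour (`add_space_prefix = true`). A minimal standalone sketch (not part of this diff; the model path is hypothetical, and the key will simply be absent in older conversions) of inspecting the key with the gguf API that ships with ggml, mirroring the gguf_find_key/gguf_get_val_bool calls used here:

// --- illustrative sketch, not part of the package diff ---
#include <cstdio>
#include "ggml.h" // the gguf_* metadata API is declared here in this release

int main() {
    const char * fname = "models/example.gguf"; // hypothetical path

    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
    struct gguf_context * gguf = gguf_init_from_file(fname, params);
    if (!gguf) { fprintf(stderr, "failed to open %s\n", fname); return 1; }

    const int keyidx = gguf_find_key(gguf, "tokenizer.ggml.add_space_prefix");
    if (keyidx == -1) {
        printf("key not present -> llama.cpp keeps the default (true)\n");
    } else {
        printf("add_space_prefix = %s\n", gguf_get_val_bool(gguf, keyidx) ? "true" : "false");
    }

    gguf_free(gguf);
    return 0;
}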
@@ -3214,7 +3315,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  // hparams
  LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
  LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
- LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
+ LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
  LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
  LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
  LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
@@ -3933,6 +4034,65 @@ static bool llm_load_tensors(
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
  }
  } break;
+ case LLM_ARCH_ORION:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
+ case LLM_ARCH_INTERNLM2:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ // layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -4029,7 +4189,7 @@ static bool llm_load_tensors(
  }

  // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
- static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
+ static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
  try {
  llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);

@@ -4050,6 +4210,22 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
  return 0;
  }

+ #ifdef GGML_USE_KOMPUTE
+ if (params.n_gpu_layers > 0 && (
+ !(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
+ || !(
+ model.ftype == LLAMA_FTYPE_ALL_F32 ||
+ model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
+ model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+ model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
+ )
+ )) {
+ // TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file
+ LLAMA_LOG_WARN("%s: disabling Kompute due to unsupported model arch or quantization\n", __func__);
+ params.n_gpu_layers = 0;
+ }
+ #endif
+
  if (!llm_load_tensors(
  ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
  params.progress_callback, params.progress_callback_user_data
@@ -6366,6 +6542,245 @@ struct llm_build_context {

  return gf;
  }
+
+ struct ggml_cgraph * build_orion() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+ cb(inpL, "inp_embd", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+ cb(inp_pos, "inp_pos", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+ cb(KQ_mask, "KQ_mask", -1);
+
+ // shift the entire K-cache if needed
+ if (do_rope_shift) {
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+ }
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, model.layers[il].attn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ // if (model.layers[il].bq) {
+ // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ // cb(Qcur, "Qcur", il);
+ // }
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ // if (model.layers[il].bk) {
+ // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ // cb(Kcur, "Kcur", il);
+ // }
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ // if (model.layers[il].bv) {
+ // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ // cb(Vcur, "Vcur", il);
+ // }
+
+ Qcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ cb(cur, "kqv_out", il);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, model.output_norm_b,
+ LLM_NORM, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
+ struct ggml_cgraph * build_internlm2() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+ cb(inpL, "inp_embd", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+ cb(inp_pos, "inp_pos", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+ cb(KQ_mask, "KQ_mask", -1);
+
+ // shift the entire K-cache if needed
+ if (do_rope_shift) {
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+ }
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ cb(cur, "kqv_out", il);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
  };

  static struct ggml_cgraph * llama_build_graph(
@@ -6520,6 +6935,14 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_codeshell();
  } break;
+ case LLM_ARCH_ORION:
+ {
+ result = llm.build_orion();
+ } break;
+ case LLM_ARCH_INTERNLM2:
+ {
+ result = llm.build_internlm2();
+ } break;
  default:
  GGML_ASSERT(false);
  }
@@ -6651,11 +7074,6 @@ static int llama_decode_internal(
  n_threads = std::min(4, n_threads);
  }

- const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
- if (ggml_cpu_has_cublas() && fully_offloaded) {
- n_threads = 1;
- }
-
  #ifdef GGML_USE_MPI
  const int64_t n_layer = hparams.n_layer;
  ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
@@ -7467,7 +7885,9 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  //
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
  if (&fragment == &fragment_buffer.front()) {
- raw_text = " " + raw_text; // prefix with space if the first token is not special
+ if (vocab.add_space_prefix) {
+ raw_text = " " + raw_text; // prefix with space if the first token is not special
+ }
  }

  #ifdef PRETOKENIZERDEBUG
@@ -7946,6 +8366,11 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
  }

  void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
+ // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
+ // if (k >= (int32_t)candidates->size) {
+ // return;
+ // }
+
  const int64_t t_start_sample_us = ggml_time_us();

  k = std::max(k, (int) min_keep);
@@ -8054,21 +8479,56 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
  return;
  }

- llama_sample_softmax(ctx, candidates);
-
  const int64_t t_start_sample_us = ggml_time_us();

- float scale = candidates->data[0].p; // scale by max prob
- size_t i = 1; // first token always matches
+ bool min_p_applied = false;
+
+ // if the candidates aren't sorted, try the unsorted implementation first
+ if (!candidates->sorted) {
+ std::vector<llama_token_data> filtered_tokens;
+
+ float max_logit = -FLT_MAX;
+ for (size_t i = 0; i < candidates->size; ++i) {
+ max_logit = std::max(max_logit, candidates->data[i].logit);
+ }
+ const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
+
+ for (size_t i = 0; i < candidates->size; ++i) {
+ if (candidates->data[i].logit >= min_logit) {
+ filtered_tokens.push_back(candidates->data[i]);
+ }
+ }

- for (; i < candidates->size; ++i) {
- if (candidates->data[i].p < p * scale && i >= min_keep) {
- break; // prob too small
+ // if we have enough values the operation was a success
+ if (filtered_tokens.size() >= min_keep) {
+ memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
+ candidates->size = filtered_tokens.size();
+ min_p_applied = true;
  }
  }

- // Resize the output vector to keep only the matching tokens
- candidates->size = i;
+ // if the candidates are sorted or the unsorted implementation failed, use this implementation
+ if (!min_p_applied) {
+ // Sort the logits in descending order
+ if (!candidates->sorted) {
+ std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+ return a.logit > b.logit;
+ });
+ candidates->sorted = true;
+ }
+
+ const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
+ size_t i = 1; // first token always matches
+
+ for (; i < candidates->size; ++i) {
+ if (candidates->data[i].logit < min_logit && i >= min_keep) {
+ break; // prob too small
+ }
+ }
+
+ // Resize the output vector to keep only the matching tokens
+ candidates->size = i;
+ }

  if (ctx) {
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
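The rewritten min-p path above filters in logit space instead of calling llama_sample_softmax first: since p_i / p_max = exp(logit_i - logit_max), the condition p_i >= p * p_max is equivalent to logit_i >= logit_max + log(p). A small standalone sketch (made-up logits, not part of the diff) that checks this equivalence:

// --- illustrative sketch, not part of the package diff ---
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const float p = 0.05f;                                   // min-p threshold
    std::vector<float> logits = {2.0f, 0.5f, -1.0f, -4.0f};  // made-up values

    float max_logit = logits[0];
    for (float l : logits) max_logit = std::max(max_logit, l);

    // softmax probabilities, computed here only to verify the equivalence
    std::vector<float> probs;
    float sum = 0.0f;
    for (float l : logits) { probs.push_back(std::exp(l - max_logit)); sum += probs.back(); }
    for (float & q : probs) q /= sum;
    const float p_max = *std::max_element(probs.begin(), probs.end());

    const float min_logit = max_logit + std::log(p); // same cutoff as the new code path
    for (size_t i = 0; i < logits.size(); ++i) {
        const bool keep_logit = logits[i] >= min_logit;
        const bool keep_prob  = probs[i]  >= p * p_max;
        std::printf("token %zu: keep(logit)=%d keep(prob)=%d\n", i, keep_logit, keep_prob);
    }
    return 0;
}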
@@ -8972,6 +9432,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
  else if (new_type != GGML_TYPE_Q8_0) {
  new_type = GGML_TYPE_Q6_K;
  }
+ } else if (name == "token_embd.weight") {
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+ new_type = GGML_TYPE_Q2_K;
+ }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+ new_type = GGML_TYPE_Q4_K;
+ }
  } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
  if (name.find("attn_v.weight") != std::string::npos) {
  if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
@@ -8982,7 +9449,6 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
  if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
  ++qs.i_ffn_down;
  }
- else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
  } else if (name.find("attn_v.weight") != std::string::npos) {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
  new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
@@ -8990,6 +9456,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
  new_type = GGML_TYPE_Q4_K;
  }
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && qs.model.hparams.n_gqa() >= 4) {
+ new_type = GGML_TYPE_Q4_K;
+ }
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
  new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
  }
@@ -9027,6 +9496,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
  if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
  }
+ //else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+ // if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
+ //}
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
  new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
  : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
@@ -9058,13 +9530,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
  } else if (name.find("attn_output.weight") != std::string::npos) {
  if (arch != LLM_ARCH_FALCON) {
  if (qs.model.hparams.n_expert == 8) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
  ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
  ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
  new_type = GGML_TYPE_Q5_K;
  }
  } else {
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  }
@@ -9107,7 +9580,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
  bool convert_incompatible_tensor = false;
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
  new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
- new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS) {
+ new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
+ new_type == GGML_TYPE_IQ3_XXS) {
  int nx = tensor->ne[0];
  int ny = tensor->ne[1];
  if (nx % QK_K != 0) {
@@ -9121,6 +9595,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
  switch (new_type) {
  case GGML_TYPE_IQ2_XXS:
  case GGML_TYPE_IQ2_XS:
+ case GGML_TYPE_IQ3_XXS:
  case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
  case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
  case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
@@ -9162,6 +9637,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
  case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;

  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
  }
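With the mapping above, the new IQ3_XXS file type can be requested through the public quantization API. A hedged sketch using the C API declared in llama.h of this release (input and output file names are hypothetical):

// --- illustrative sketch, not part of the package diff ---
#include <cstdint>
#include <cstdio>
#include "llama.h"

int main() {
    llama_backend_init(false);

    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; // new in this version

    // Hypothetical file names.
    const uint32_t rc = llama_model_quantize("model-f16.gguf", "model-iq3_xxs.gguf", &qparams);
    if (rc != 0) {
        fprintf(stderr, "quantization failed (code %u)\n", rc);
    }

    llama_backend_free();
    return rc == 0 ? 0 : 1;
}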
@@ -9812,18 +10288,45 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
  return result;
  }

- int32_t llama_max_devices(void) {
- return LLAMA_MAX_DEVICES;
+ size_t llama_max_devices(void) {
+ #if defined(GGML_USE_METAL)
+ return 1;
+ #elif defined(GGML_USE_CUBLAS)
+ return GGML_CUDA_MAX_DEVICES;
+ #elif defined(GGML_USE_SYCL)
+ return GGML_SYCL_MAX_DEVICES;
+ #else
+ return 1;
+ #endif
  }

- bool llama_mmap_supported(void) {
+ bool llama_supports_mmap(void) {
  return llama_mmap::SUPPORTED;
  }

- bool llama_mlock_supported(void) {
+ bool llama_supports_mlock(void) {
  return llama_mlock::SUPPORTED;
  }

+ bool llama_supports_gpu_offload(void) {
+ #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+ defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+ // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+ return true;
+ #else
+ return false;
+ #endif
+ }
+
+ // deprecated:
+ bool llama_mmap_supported(void) {
+ return llama_supports_mmap();
+ }
+
+ bool llama_mlock_supported(void) {
+ return llama_supports_mlock();
+ }
+
  void llama_backend_init(bool numa) {
  ggml_time_init();

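The renamed capability queries are plain functions in the public C header, and the deprecated llama_mmap_supported/llama_mlock_supported wrappers keep old callers compiling. A minimal usage sketch:

// --- illustrative sketch, not part of the package diff ---
#include <cstdio>
#include "llama.h"

int main() {
    printf("max devices     : %zu\n", llama_max_devices());
    printf("mmap supported  : %s\n", llama_supports_mmap()        ? "yes" : "no");
    printf("mlock supported : %s\n", llama_supports_mlock()       ? "yes" : "no");
    printf("GPU offload     : %s\n", llama_supports_gpu_offload() ? "yes" : "no");
    return 0;
}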
@@ -9855,8 +10358,8 @@ int64_t llama_time_us(void) {
  }

  struct llama_model * llama_load_model_from_file(
- const char * path_model,
- struct llama_model_params params) {
+ const char * path_model,
+ struct llama_model_params params) {
  ggml_time_init();

  llama_model * model = new llama_model;
@@ -9997,6 +10500,36 @@ struct llama_context * llama_new_context_with_model(
  }
  }
  }
+ #elif defined(GGML_USE_VULKAN)
+ if (model->n_gpu_layers > 0) {
+ ggml_backend_t backend = ggml_backend_vk_init();
+ if (backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
+ llama_free(ctx);
+ return nullptr;
+ }
+ ctx->backends.push_back(backend);
+ }
+ #elif defined(GGML_USE_SYCL)
+ if (model->n_gpu_layers > 0) {
+ ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+ if (backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
+ llama_free(ctx);
+ return nullptr;
+ }
+ ctx->backends.push_back(backend);
+ }
+ #elif defined(GGML_USE_KOMPUTE)
+ if (model->n_gpu_layers > 0) {
+ auto * backend = ggml_backend_kompute_init(model->main_gpu);
+ if (backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
+ llama_free(ctx);
+ return nullptr;
+ }
+ ctx->backends.push_back(backend);
+ }
  #endif
  ctx->backend_cpu = ggml_backend_cpu_init();
  if (ctx->backend_cpu == nullptr) {
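Which of these GPU backends is actually compiled in depends on the GGML_USE_* build flags; from the caller's side the selection stays behind n_gpu_layers. A hedged sketch of the calling side (the model path is hypothetical):

// --- illustrative sketch, not part of the package diff ---
#include <cstdio>
#include "llama.h"

int main() {
    llama_backend_init(false);

    llama_model_params mparams = llama_model_default_params();
    // Ask for offload only if this build supports it; the concrete backend
    // (CUDA, Vulkan, SYCL, Kompute, ...) is chosen at compile time.
    mparams.n_gpu_layers = llama_supports_gpu_offload() ? 99 : 0;

    llama_model * model = llama_load_model_from_file("models/example.gguf", mparams);
    if (!model) { fprintf(stderr, "failed to load model\n"); return 1; }

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (!ctx) { fprintf(stderr, "failed to create context\n"); llama_free_model(model); return 1; }

    // ... decode tokens here ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}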
@@ -10844,22 +11377,24 @@ struct llama_batch llama_batch_get_one(
  };
  }

- struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
+ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
  llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };

  if (embd) {
- batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
+ batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
  } else {
- batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
+ batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
  }

- batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
- batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens);
- batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
- for (int i = 0; i < n_tokens; ++i) {
+ batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens_alloc);
+ batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens_alloc);
+ batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
+ for (int i = 0; i < n_tokens_alloc; ++i) {
  batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
  }
- batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
+ batch.seq_id[n_tokens_alloc] = nullptr;
+
+ batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens_alloc);

  return batch;
  }
@@ -10870,7 +11405,7 @@ void llama_batch_free(struct llama_batch batch) {
  if (batch.pos) free(batch.pos);
  if (batch.n_seq_id) free(batch.n_seq_id);
  if (batch.seq_id) {
- for (int i = 0; i < batch.n_tokens; ++i) {
+ for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
  free(batch.seq_id[i]);
  }
  free(batch.seq_id);
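Because llama_batch_init now stores a nullptr sentinel at seq_id[n_tokens_alloc], llama_batch_free can walk the per-token seq_id arrays without relying on batch.n_tokens. A short usage sketch (the token id is a placeholder; real code would come from llama_tokenize):

// --- illustrative sketch, not part of the package diff ---
#include "llama.h"

int main() {
    // Room for up to 512 tokens, token ids only (embd == 0), 1 seq id per token.
    llama_batch batch = llama_batch_init(512, 0, 1);

    batch.n_tokens     = 1;
    batch.token[0]     = 1;   // placeholder token id
    batch.pos[0]       = 0;
    batch.n_seq_id[0]  = 1;
    batch.seq_id[0][0] = 0;
    batch.logits[0]    = 1;

    // ... llama_decode(ctx, batch) would go here ...

    // Safe even if n_tokens were still 0: the free loop stops at the sentinel.
    llama_batch_free(batch);
    return 0;
}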