@fugood/llama.node 0.0.1-alpha.3 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/CMakeLists.txt +36 -7
  2. package/README.md +9 -0
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/lib/binding.js +18 -1
  14. package/lib/binding.ts +22 -2
  15. package/lib/index.ts +2 -2
  16. package/package.json +15 -3
  17. package/src/LlamaCompletionWorker.cpp +5 -1
  18. package/src/LlamaCompletionWorker.h +4 -0
  19. package/src/LlamaContext.cpp +18 -1
  20. package/src/common.hpp +11 -7
  21. package/src/llama.cpp/CMakeLists.txt +13 -7
  22. package/src/llama.cpp/common/common.cpp +221 -173
  23. package/src/llama.cpp/common/common.h +19 -8
  24. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  25. package/src/llama.cpp/common/log.h +2 -2
  26. package/src/llama.cpp/common/sampling.cpp +17 -1
  27. package/src/llama.cpp/common/sampling.h +28 -20
  28. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +17 -11
  29. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +5 -5
  30. package/src/llama.cpp/examples/finetune/finetune.cpp +1 -1
  31. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -4
  32. package/src/llama.cpp/examples/imatrix/imatrix.cpp +72 -39
  33. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -3
  34. package/src/llama.cpp/examples/llava/clip.cpp +74 -23
  35. package/src/llama.cpp/examples/llava/llava-cli.cpp +37 -28
  36. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -1
  37. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -1
  38. package/src/llama.cpp/examples/main/main.cpp +10 -8
  39. package/src/llama.cpp/examples/perplexity/perplexity.cpp +175 -55
  40. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  41. package/src/llama.cpp/examples/quantize/quantize.cpp +74 -47
  42. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  43. package/src/llama.cpp/examples/server/server.cpp +97 -86
  44. package/src/llama.cpp/examples/server/utils.hpp +17 -15
  45. package/src/llama.cpp/ggml-backend.c +7 -5
  46. package/src/llama.cpp/ggml-impl.h +339 -4
  47. package/src/llama.cpp/ggml-kompute.cpp +7 -0
  48. package/src/llama.cpp/ggml-opencl.cpp +1 -0
  49. package/src/llama.cpp/ggml-quants.c +302 -293
  50. package/src/llama.cpp/ggml-sycl.cpp +28 -16
  51. package/src/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
  52. package/src/llama.cpp/ggml-vulkan.cpp +951 -263
  53. package/src/llama.cpp/ggml.c +1469 -116
  54. package/src/llama.cpp/ggml.h +37 -7
  55. package/src/llama.cpp/llama.cpp +969 -432
  56. package/src/llama.cpp/llama.h +46 -14
  57. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +2 -0
  58. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -1
  59. package/src/llama.cpp/requirements/requirements-convert.txt +2 -2
  60. package/src/llama.cpp/requirements.txt +1 -0
  61. package/src/llama.cpp/sgemm.cpp +134 -103
  62. package/src/llama.cpp/sgemm.h +4 -2
  63. package/src/llama.cpp/tests/CMakeLists.txt +96 -36
  64. package/src/llama.cpp/tests/test-backend-ops.cpp +56 -6
  65. package/src/llama.cpp/tests/test-chat-template.cpp +4 -0
  66. package/src/llama.cpp/tests/test-grammar-integration.cpp +225 -136
  67. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -0
  68. package/src/llama.cpp/tests/test-tokenizer-0.cpp +292 -0
  69. package/src/llama.cpp/tests/{test-tokenizer-1-llama.cpp → test-tokenizer-1-spm.cpp} +1 -1
  70. package/src/llama.cpp/unicode-data.cpp +1188 -656
  71. package/src/llama.cpp/unicode-data.h +4 -3
  72. package/src/llama.cpp/unicode.cpp +590 -49
  73. package/src/llama.cpp/unicode.h +6 -3
  74. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +0 -187
  75. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +0 -190
@@ -75,6 +75,7 @@
  #include <forward_list>
  #include <fstream>
  #include <functional>
+ #include <future>
  #include <initializer_list>
  #include <locale>
  #include <map>
@@ -107,7 +108,6 @@
  #define LLAMA_MAX_NODES 8192
  #define LLAMA_MAX_EXPERTS 60

-
  //
  // logging
  //
@@ -211,6 +211,7 @@ enum llm_arch {
  LLM_ARCH_QWEN2,
  LLM_ARCH_QWEN2MOE,
  LLM_ARCH_PHI2,
+ LLM_ARCH_PHI3,
  LLM_ARCH_PLAMO,
  LLM_ARCH_CODESHELL,
  LLM_ARCH_ORION,
@@ -246,6 +247,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_QWEN2, "qwen2" },
  { LLM_ARCH_QWEN2MOE, "qwen2moe" },
  { LLM_ARCH_PHI2, "phi2" },
+ { LLM_ARCH_PHI3, "phi3" },
  { LLM_ARCH_PLAMO, "plamo" },
  { LLM_ARCH_CODESHELL, "codeshell" },
  { LLM_ARCH_ORION, "orion" },
@@ -314,6 +316,7 @@ enum llm_kv {
  LLM_KV_SSM_TIME_STEP_RANK,

  LLM_KV_TOKENIZER_MODEL,
+ LLM_KV_TOKENIZER_PRE,
  LLM_KV_TOKENIZER_LIST,
  LLM_KV_TOKENIZER_TOKEN_TYPE,
  LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
@@ -390,6 +393,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },

  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
+ { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
  { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
  { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
@@ -793,6 +797,23 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_PHI3,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_PLAMO,
  {
@@ -1824,7 +1845,7 @@ struct llama_hparams {
  float f_logit_scale = 0.0f;

  bool causal_attn = true;
- bool need_kq_pos = false;
+ bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models

  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
  enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -1914,6 +1935,7 @@ struct llama_cparams {
  bool embeddings;
  bool causal_attn;
  bool offload_kqv;
+ bool flash_attn;

  enum llama_pooling_type pooling_type;

@@ -2017,8 +2039,8 @@ struct llama_kv_cache {
  bool has_shift = false;
  bool do_defrag = false;
  bool do_copy = false;
- // with recurrent state models, a cell can hold the state for more than one past token
- bool recurrent = false;
+ bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
+ bool v_trans = true; // the value tensor is transposed

  // Note: The value of head isn't only used to optimize searching
  // for a free KV slot. llama_decode_internal also uses it, so it
@@ -2095,7 +2117,8 @@ struct llama_vocab {
  ttype type;
  };

- enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
+ enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
+ enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

  std::unordered_map<token, id> token_to_id;
  std::vector<token_data> id_to_token;
@@ -2316,11 +2339,14 @@ struct llama_context {

  static bool llama_kv_cache_init(
  struct llama_kv_cache & cache,
- const llama_model & model,
+ const llama_context * ctx,
  ggml_type type_k,
  ggml_type type_v,
  uint32_t kv_size,
  bool offload) {
+ const llama_model & model = ctx->model;
+ const llama_cparams & cparams = ctx->cparams;
+
  const struct llama_hparams & hparams = model.hparams;

  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
@@ -2331,8 +2357,9 @@ static bool llama_kv_cache_init(

  // TODO: find a nicer way to add other recurrent model architectures
  cache.recurrent = model.arch == LLM_ARCH_MAMBA;
+ cache.v_trans = !cparams.flash_attn;

- // TODO: support mixed reccurent Transformer architectues
+ // TODO: support mixed recurrent Transformer architectures
  // NOTE: (!a || b) is a logical implication (a -> b)
  GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
  GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());
@@ -2543,6 +2570,10 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
  }
  cache.head = 0;
  cache.used = 0;
+
+ for (auto & buf : cache.bufs) {
+ ggml_backend_buffer_clear(buf, 0);
+ }
  }

  static bool llama_kv_cache_seq_rm(
@@ -2863,6 +2894,7 @@ namespace GGUFMeta {
  case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
  case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
  case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
+ case LLAMA_KV_OVERRIDE_TYPE_STR: return "str";
  }
  return "unknown";
  }
@@ -2874,13 +2906,16 @@ namespace GGUFMeta {
  __func__, override_type_to_str(ovrd->tag), ovrd->key);
  switch (ovrd->tag) {
  case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
- LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
+ LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
  } break;
  case LLAMA_KV_OVERRIDE_TYPE_INT: {
- LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
+ LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
  } break;
  case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
- LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
+ LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
+ } break;
+ case LLAMA_KV_OVERRIDE_TYPE_STR: {
+ LLAMA_LOG_INFO("%s\n", ovrd->val_str);
  } break;
  default:
  // Shouldn't be possible to end up here, but just in case...
@@ -2899,7 +2934,7 @@ namespace GGUFMeta {
  static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
  try_override(OT & target, const struct llama_model_kv_override * ovrd) {
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
- target = ovrd->bool_value;
+ target = ovrd->val_bool;
  return true;
  }
  return false;
@@ -2909,7 +2944,7 @@ namespace GGUFMeta {
  static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
  try_override(OT & target, const struct llama_model_kv_override * ovrd) {
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
- target = ovrd->int_value;
+ target = ovrd->val_i64;
  return true;
  }
  return false;
@@ -2919,7 +2954,7 @@ namespace GGUFMeta {
  static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
  try_override(T & target, const struct llama_model_kv_override * ovrd) {
  if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
- target = ovrd->float_value;
+ target = ovrd->val_f64;
  return true;
  }
  return false;
@@ -2928,12 +2963,11 @@ namespace GGUFMeta {
  template<typename OT>
  static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
  try_override(T & target, const struct llama_model_kv_override * ovrd) {
- (void)target;
- (void)ovrd;
- if (!ovrd) { return false; }
- // Currently, we should never end up here so it would be a bug if we do.
- throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
- ovrd ? ovrd->key : "NULL"));
+ if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
+ target = ovrd->val_str;
+ return true;
+ }
+ return false;
  }

  static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
@@ -2966,6 +3000,7 @@ struct llama_model_loader {
  size_t n_bytes = 0;

  bool use_mmap = false;
+ bool check_tensors;

  llama_files files;
  llama_ftype ftype;
@@ -2980,9 +3015,13 @@

  ggml_tensor * tensor;

- llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+ llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
  const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
  offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+
+ if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
+ throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
+ }
  }
  };
  std::vector<llama_tensor_weight> weights;
@@ -2995,7 +3034,7 @@
  std::string arch_name;
  LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

- llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
+ llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
  int trace = 0;
  if (getenv("LLAMA_TRACE")) {
  trace = atoi(getenv("LLAMA_TRACE"));
@@ -3021,15 +3060,15 @@
  get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
  llm_kv = LLM_KV(llm_arch_from_string(arch_name));

+ files.emplace_back(new llama_file(fname.c_str(), "rb"));
+ contexts.emplace_back(ctx);
+
  // Save tensors data offset of the main file.
  // For subsidiary files, `meta` tensor data offset must not be used,
  // so we build a unified tensors index for weights.
  for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
- weights.emplace_back(0, cur->name, meta, cur);
+ weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
  }
- files.emplace_back(new llama_file(fname.c_str(), "rb"));
- contexts.emplace_back(ctx);
-
  uint16_t n_split = 0;
  get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);

@@ -3063,12 +3102,13 @@
  throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
  }

+ files.emplace_back(new llama_file(split_path, "rb"));
+ contexts.emplace_back(ctx);
+
  // Save tensors data offset info of the shard.
  for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
- weights.emplace_back(idx, cur->name, ctx_gguf, cur);
+ weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
  }
- files.emplace_back(new llama_file(split_path, "rb"));
- contexts.emplace_back(ctx);

  gguf_free(ctx_gguf);
  }
@@ -3091,9 +3131,17 @@

  fver = (enum llama_fver) gguf_get_version(meta);

+ std::set<std::string> tensor_names;
  for (auto & w : weights) {
  n_elements += ggml_nelements(w.tensor);
  n_bytes += ggml_nbytes(w.tensor);
+ // make sure there is no duplicated tensor names
+ const std::string name(w.tensor->name);
+ auto found = tensor_names.find(name);
+ if (found != tensor_names.end()) {
+ throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
+ }
+ tensor_names.insert(name);
  }

  LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -3127,6 +3175,7 @@
  switch (type_max) {
  case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
  case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
+ case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
  case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
  case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
  case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
@@ -3199,6 +3248,7 @@
  }

  this->use_mmap = use_mmap;
+ this->check_tensors = check_tensors;
  }

  ~llama_model_loader() {
@@ -3278,6 +3328,10 @@
  return nullptr;
  }

+ const llama_tensor_weight * get_weight(int i) const {
+ return get_weight(get_tensor_name(i));
+ }
+
  const llama_tensor_weight & require_weight(const char * name) const {
  const llama_tensor_weight * weight = get_weight(name);
  if (!weight) {
@@ -3453,6 +3507,10 @@
  file->seek(w.offs, SEEK_SET);
  file->read_raw(cur->data, ggml_nbytes(cur));
  }
+
+ if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
+ throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+ }
  }

  size_t size_done = 0;
@@ -3469,6 +3527,8 @@
  GGML_ASSERT(size_data != 0 && "call init_mappings() first");

  std::vector<no_init<uint8_t>> read_buf;
+ std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
+
  for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
  const auto * weight = get_weight(ggml_get_name(cur));
  if (weight == nullptr) {
@@ -3490,37 +3550,66 @@
  if (bufs_mmap.count(weight->idx)) {
  buf_mmap = bufs_mmap.at(weight->idx);
  }
+ uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
+
+ if (check_tensors) {
+ validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
+ return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
+ }));
+ }
+
  GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
  if (buf_mmap && cur->data == nullptr) {
- ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
+ ggml_backend_tensor_alloc(buf_mmap, cur, data);
  if (lmlocks) {
  const auto & lmlock = lmlocks->at(weight->idx);
- lmlock->grow_to(weight->offs + ggml_nbytes(cur));
+ lmlock->grow_to(weight->offs + n_size);
  }

  auto & mmap_used = mmaps_used[weight->idx];
  mmap_used.first = std::min(mmap_used.first, weight->offs);
  mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
  } else {
- ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
+ ggml_backend_tensor_set(cur, data, 0, n_size);
  }
  } else {
  GGML_ASSERT(weight->idx < files.size());
  const auto & file = files.at(weight->idx);
  if (ggml_backend_buffer_is_host(cur->buffer)) {
  file->seek(weight->offs, SEEK_SET);
- file->read_raw(cur->data, ggml_nbytes(cur));
+ file->read_raw(cur->data, n_size);
+ if (check_tensors) {
+ validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
+ return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
+ }));
+ }
  } else {
- read_buf.resize(ggml_nbytes(cur));
+ read_buf.resize(n_size);
  file->seek(weight->offs, SEEK_SET);
- file->read_raw(read_buf.data(), ggml_nbytes(cur));
+ file->read_raw(read_buf.data(), n_size);
  ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+ if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+ throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+ }
  }
  }

  size_done += n_size;
  }

+ // check validation results
+ bool validation_failed = false;
+ for (auto & future : validation_result) {
+ auto result = future.get();
+ if (!result.second) {
+ LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
+ validation_failed = true;
+ }
+ }
+ if (validation_failed) {
+ throw std::runtime_error("found tensors with invalid data");
+ }
+
  // check if this is the last call and do final cleanup
  if (size_done >= size_data) {
  // unmap offloaded tensors and metadata
@@ -3578,6 +3667,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
  switch (ftype) {
  case LLAMA_FTYPE_ALL_F32: return "all F32";
  case LLAMA_FTYPE_MOSTLY_F16: return "F16";
+ case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
  case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
  case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
@@ -3955,6 +4045,16 @@ static void llm_load_hparams(
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

+ switch (hparams.n_layer) {
+ case 24: model.type = e_model::MODEL_1B; break;
+ case 32: model.type = e_model::MODEL_3B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_PHI3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
  switch (hparams.n_layer) {
  case 24: model.type = e_model::MODEL_1B; break;
  case 32: model.type = e_model::MODEL_3B; break;
@@ -4104,7 +4204,7 @@ static void llm_load_hparams(
  model.ftype = ml.ftype;

  if (hparams.f_max_alibi_bias > 0.0f) {
- hparams.need_kq_pos = true;
+ hparams.use_alibi = true;
  }

  hparams.rope_type = llama_rope_type(&model);
@@ -4127,11 +4227,13 @@ static void llm_load_vocab(

  // determine vocab type
  {
- std::string tokenizer_name;
+ std::string tokenizer_model;
+ std::string tokenizer_pre;

- ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
+ ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
+ ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);

- if (tokenizer_name == "no_vocab") {
+ if (tokenizer_model == "no_vocab") {
  vocab.type = LLAMA_VOCAB_TYPE_NONE;

  // default special tokens
@@ -4145,7 +4247,7 @@ static void llm_load_vocab(
  vocab.linefeed_id = -1;

  return;
- } else if (tokenizer_name == "llama") {
+ } else if (tokenizer_model == "llama") {
  vocab.type = LLAMA_VOCAB_TYPE_SPM;

  // default special tokens
@@ -4190,9 +4292,27 @@ static void llm_load_vocab(
  if (add_space_prefix_keyidx != -1) {
  vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
  } // The default value of add_space_prefix is true.
- } else if (tokenizer_name == "gpt2") {
- vocab.type = LLAMA_VOCAB_TYPE_BPE;
+ } else if (tokenizer_model == "bert") {
+ vocab.type = LLAMA_VOCAB_TYPE_WPM;

+ // default special tokens
+ vocab.special_bos_id = -1;
+ vocab.special_eos_id = -1;
+ vocab.special_unk_id = 100;
+ vocab.special_sep_id = 102;
+ vocab.special_pad_id = 0;
+ vocab.special_cls_id = 101;
+ vocab.special_mask_id = 103;
+ vocab.add_space_prefix = false;
+ } else {
+ if (tokenizer_model == "gpt2") {
+ vocab.type = LLAMA_VOCAB_TYPE_BPE;
+ } else {
+ LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
+ LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
+ vocab.type = LLAMA_VOCAB_TYPE_SPM;
+ return;
+ }
  // read bpe merges and populate bpe ranks
  const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
  if (merges_keyidx == -1) {
@@ -4226,23 +4346,65 @@ static void llm_load_vocab(
  vocab.special_pad_id = -1;
  vocab.special_cls_id = -1;
  vocab.special_mask_id = -1;
- } else if (tokenizer_name == "bert") {
- vocab.type = LLAMA_VOCAB_TYPE_WPM;
+ }

- // default special tokens
- vocab.special_bos_id = -1;
- vocab.special_eos_id = -1;
- vocab.special_unk_id = 100;
- vocab.special_sep_id = 102;
- vocab.special_pad_id = 0;
- vocab.special_cls_id = 101;
- vocab.special_mask_id = 103;
- vocab.add_space_prefix = false;
+ // for now, only BPE models have pre-tokenizers
+ if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
+ if (tokenizer_pre.empty()) {
+ LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
+ LLAMA_LOG_WARN("%s: \n", __func__);
+ LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+ LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
+ LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
+ LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+ LLAMA_LOG_WARN("%s: \n", __func__);
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ } else if (
+ tokenizer_pre == "default") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ } else if (
+ tokenizer_pre == "llama3" ||
+ tokenizer_pre == "llama-v3" ||
+ tokenizer_pre == "llama-bpe") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+ } else if (
+ tokenizer_pre == "deepseek-llm") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+ } else if (
+ tokenizer_pre == "deepseek-coder") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+ } else if (
+ tokenizer_pre == "falcon") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
+ } else if (
+ tokenizer_pre == "mpt") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
+ } else if (
+ tokenizer_pre == "starcoder") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
+ } else if (
+ tokenizer_pre == "gpt-2") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+ } else if (
+ tokenizer_pre == "refact") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
+ } else if (
+ tokenizer_pre == "command-r") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+ } else if (
+ tokenizer_pre == "qwen2") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+ } else if (
+ tokenizer_pre == "olmo") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
+ } else if (
+ tokenizer_pre == "dbrx") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
+ } else {
+ throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ }
  } else {
- LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
- LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
-
- vocab.type = LLAMA_VOCAB_TYPE_SPM;
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
  }
  }

@@ -4352,6 +4514,7 @@ static void llm_load_vocab(
  //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
  (t.first == "<|eot_id|>" ||
  t.first == "<|im_end|>" ||
+ t.first == "<|end|>" ||
  t.first == "<end_of_turn>"
  )
  ) {
@@ -5375,6 +5538,33 @@ static bool llm_load_tensors(
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
  }
  } break;
+ case LLM_ARCH_PHI3:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context* ctx_layer = ctx_for_layer(i);
+ ggml_context* ctx_split = ctx_for_layer_split(i);
+
+ auto& layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
+
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
+
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
+ }
+ } break;
  case LLM_ARCH_PLAMO:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5909,7 +6099,7 @@
  // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
  static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
  try {
- llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
+ llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);

  model.hparams.vocab_only = params.vocab_only;

@@ -5947,6 +6137,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
  || !(
  model.ftype == LLAMA_FTYPE_ALL_F32 ||
  model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
+ model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
  model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
  model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
  )
@@ -6038,37 +6229,47 @@ static struct ggml_tensor * llm_build_inp_embd(
  static void llm_build_kv_store(
  struct ggml_context * ctx,
  const llama_hparams & hparams,
+ const llama_cparams & cparams,
  const llama_kv_cache & kv,
  struct ggml_cgraph * graph,
  struct ggml_tensor * k_cur,
  struct ggml_tensor * v_cur,
- int64_t n_ctx,
  int32_t n_tokens,
  int32_t kv_head,
  const llm_build_cb & cb,
  int64_t il) {
+ const int64_t n_ctx = cparams.n_ctx;
+
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();

  GGML_ASSERT(kv.size == n_ctx);

- // compute the transposed [n_tokens, n_embd] V matrix
- assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
- struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
- cb(v_cur_t, "v_cur_t", il);
-
  struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
  (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
  cb(k_cache_view, "k_cache_view", il);

- struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
- ( n_ctx)*ggml_element_size(kv.v_l[il]),
- (kv_head)*ggml_element_size(kv.v_l[il]));
+ // note: storing RoPE-ed version of K in the KV cache
+ ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
+
+ assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
+
+ struct ggml_tensor * v_cache_view = nullptr;
+
+ if (cparams.flash_attn) {
+ v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
+ (kv_head)*ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
+ } else {
+ // note: the V cache is transposed when not using flash attention
+ v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
+ ( n_ctx)*ggml_element_size(kv.v_l[il]),
+ (kv_head)*ggml_element_size(kv.v_l[il]));
+
+ v_cur = ggml_transpose(ctx, v_cur);
+ }
  cb(v_cache_view, "v_cache_view", il);

- // important: storing RoPE-ed version of K in the KV cache!
- ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
- ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
+ ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
  }

  static struct ggml_tensor * llm_build_norm(
@@ -6288,11 +6489,11 @@ static struct ggml_tensor * llm_build_moe_ffn(
  return moe_out;
  }

- // if max_alibi_bias > 0 then apply ALiBi
  static struct ggml_tensor * llm_build_kqv(
  struct ggml_context * ctx,
  const llama_model & model,
  const llama_hparams & hparams,
+ const llama_cparams & cparams,
  const llama_kv_cache & kv,
  struct ggml_cgraph * graph,
  struct ggml_tensor * wo,
@@ -6300,12 +6501,12 @@ static struct ggml_tensor * llm_build_kqv(
  struct ggml_tensor * q_cur,
  struct ggml_tensor * kq_mask,
  struct ggml_tensor * kq_pos,
- int64_t n_ctx,
  int32_t n_tokens,
  int32_t n_kv,
  float kq_scale,
  const llm_build_cb & cb,
  int il) {
+ const int64_t n_ctx = cparams.n_ctx;
  const int64_t n_head = hparams.n_head;
  const int64_t n_head_kv = hparams.n_head_kv;
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -6323,71 +6524,99 @@ static struct ggml_tensor * llm_build_kqv(
  0);
  cb(k, "k", il);

- struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
- cb(kq, "kq", il);
+ struct ggml_tensor * cur;

- if (model.arch == LLM_ARCH_PHI2) {
- // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
- // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
- ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
- }
+ if (cparams.flash_attn) {
+ GGML_UNUSED(model);
+ GGML_UNUSED(n_ctx);

- if (model.arch == LLM_ARCH_GROK) {
- // need to do the following:
- // multiply by attn_output_multiplyer of 0.08838834764831845
- // and then :
- // kq = 30 * tanh(kq / 30)
- // before the softmax below
+ // note: if this assert triggers, then some check has failed earlier
+ // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
+ GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");

- //try from phi2
- //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+ // split cached v into n_head heads (not transposed)
+ struct ggml_tensor * v =
+ ggml_view_3d(ctx, kv.v_l[il],
+ n_embd_head_v, n_kv, n_head_kv,
+ ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
+ ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
+ 0);
+ cb(v, "v", il);

- kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
- kq = ggml_scale(ctx, kq, 30);
- }
+ cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
+
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+ ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
+ }
+
+ cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
+ } else {
+ struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
+ cb(kq, "kq", il);
+
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+ // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
+ // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
+ ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+ }
+
+ if (model.arch == LLM_ARCH_GROK) {
+ // need to do the following:
+ // multiply by attn_output_multiplyer of 0.08838834764831845
+ // and then :
+ // kq = 30 * tanh(kq / 30)
+ // before the softmax below
+
+ //try from phi2
+ //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+
+ kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
+ kq = ggml_scale(ctx, kq, 30);
+ }

  #if defined(GGML_USE_KOMPUTE)
  #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
  #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
  #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
- if (hparams.f_max_alibi_bias > 0.0f) {
- kq = ggml_scale(ctx, kq, kq_scale);
- cb(kq, "kq_scaled", il);
+ if (hparams.use_alibi) {
+ kq = ggml_scale(ctx, kq, kq_scale);
+ cb(kq, "kq_scaled", il);

- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
- cb(kq, "kq_scaled_alibi", il);
+ kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
+ cb(kq, "kq_scaled_alibi", il);

- kq = ggml_add(ctx, kq, kq_mask);
- cb(kq, "kq_masked", il);
+ kq = ggml_add(ctx, kq, kq_mask);
+ cb(kq, "kq_masked", il);

- kq = ggml_soft_max(ctx, kq);
- cb(kq, "kq_soft_max", il);
- } else
+ kq = ggml_soft_max(ctx, kq);
+ cb(kq, "kq_soft_max", il);
+ } else
  #endif
- {
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
- cb(kq, "kq_soft_max_ext", il);
- }
+ {
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
+ cb(kq, "kq_soft_max_ext", il);
+ }

- GGML_ASSERT(kv.size == n_ctx);
+ GGML_ASSERT(kv.size == n_ctx);

- // split cached v into n_head heads
- struct ggml_tensor * v =
- ggml_view_3d(ctx, kv.v_l[il],
- n_kv, n_embd_head_v, n_head_kv,
- ggml_element_size(kv.v_l[il])*n_ctx,
- ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
- 0);
- cb(v, "v", il);
+ // split cached v into n_head heads
+ struct ggml_tensor * v =
+ ggml_view_3d(ctx, kv.v_l[il],
+ n_kv, n_embd_head_v, n_head_kv,
+ ggml_element_size(kv.v_l[il])*n_ctx,
+ ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
+ 0);
+ cb(v, "v", il);

- struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
- cb(kqv, "kqv", il);
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
+ cb(kqv, "kqv", il);

- struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
- cb(kqv_merged, "kqv_merged", il);
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
+ cb(kqv_merged, "kqv_merged", il);

- struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
- cb(cur, "kqv_merged_cont", il);
+ cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
+ cb(cur, "kqv_merged_cont", il);
+ }

  ggml_build_forward_expand(graph, cur);

@@ -6407,6 +6636,7 @@ static struct ggml_tensor * llm_build_kv(
  struct ggml_context * ctx,
  const llama_model & model,
  const llama_hparams & hparams,
+ const llama_cparams & cparams,
  const llama_kv_cache & kv,
  struct ggml_cgraph * graph,
  struct ggml_tensor * wo,
@@ -6416,7 +6646,6 @@ static struct ggml_tensor * llm_build_kv(
  struct ggml_tensor * q_cur,
  struct ggml_tensor * kq_mask,
  struct ggml_tensor * kq_pos,
- int64_t n_ctx,
  int32_t n_tokens,
  int32_t kv_head,
  int32_t n_kv,
@@ -6430,12 +6659,12 @@ static struct ggml_tensor * llm_build_kv(
  ggml_build_forward_expand(graph, k_cur);
  ggml_build_forward_expand(graph, v_cur);

- llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
+ llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);

  struct ggml_tensor * cur;

- cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
- q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
+ cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
+ q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
  cb(cur, "kqv_out", il);

  return cur;
@@ -6477,6 +6706,8 @@ struct llm_build_context {
  const int32_t kv_head; // index of where we store new KV data in the cache
  const int32_t n_orig_ctx;

+ const bool flash_attn;
+
  const enum llama_pooling_type pooling_type;
  const enum llama_rope_type rope_type;

@@ -6523,6 +6754,7 @@ struct llm_build_context {
  n_outputs (worst_case ? n_tokens : lctx.n_outputs),
  kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
  n_orig_ctx (cparams.n_yarn_orig_ctx),
+ flash_attn (cparams.flash_attn),
  pooling_type (cparams.pooling_type),
  rope_type (hparams.rope_type),
  cb (cb),
@@ -6637,15 +6869,31 @@ struct llm_build_context {
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
  ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));

- ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
- nm, n_embd_v_gqa,
- ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
- ggml_row_size(kv_self.v_l[il]->type, i));
+ ggml_tensor * view_v_src;
+ ggml_tensor * view_v_dst;
+
+ if (flash_attn) {
+ // NOTE: the V cache is not transposed when using flash attention
+ view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+ n_embd_v_gqa, nm,
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));

- ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
- nm, n_embd_v_gqa,
- ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
- ggml_row_size(kv_self.v_l[il]->type, id));
+ view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+ n_embd_v_gqa, nm,
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+ } else {
+ view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+ nm, n_embd_v_gqa,
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+ ggml_row_size(kv_self.v_l[il]->type, i));
+
+ view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+ nm, n_embd_v_gqa,
+ ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+ ggml_row_size(kv_self.v_l[il]->type, id));
+ }

  ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
@@ -6675,20 +6923,26 @@ struct llm_build_context {

  struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
  if (causal) {
- lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
+ lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
  } else {
- lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
+ lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
  }
  cb(lctx.inp_KQ_mask, "KQ_mask", -1);
  ggml_set_input(lctx.inp_KQ_mask);
- return lctx.inp_KQ_mask;
+ return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
  }

- struct ggml_tensor * build_inp_KQ_pos() {
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
+ struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
+ if (causal) {
+ lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
+ } else {
+ // TODO: this will be needed for ALiBi-based BERT models
+ // https://github.com/ggerganov/llama.cpp/pull/6826
+ lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
+ }
  cb(lctx.inp_KQ_pos, "KQ_pos", -1);
  ggml_set_input(lctx.inp_KQ_pos);
- return lctx.inp_KQ_pos;
+ return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
  }

  struct ggml_tensor * build_inp_mean() {
@@ -6794,9 +7048,9 @@ struct llm_build_context {
  );
  cb(Kcur, "Kcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -6934,9 +7188,9 @@ struct llm_build_context {
  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -7041,9 +7295,9 @@ struct llm_build_context {
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Kcur, "Kcur", il);
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -7161,9 +7415,9 @@ struct llm_build_context {
  );
  cb(Kcur, "Kcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -7286,9 +7540,9 @@ struct llm_build_context {
  );
  cb(Kcur, "Kcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  }

  if (il == n_layer - 1) {
@@ -7438,9 +7692,9 @@ struct llm_build_context {
  );
  cb(Kcur, "Kcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
- model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -7550,9 +7804,9 @@ struct llm_build_context {

  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -7754,9 +8008,9 @@ struct llm_build_context {
  );
  cb(Vcur, "Vcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -7850,9 +8104,9 @@ struct llm_build_context {
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  cb(Qcur, "Qcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -8143,9 +8397,9 @@ struct llm_build_context {

  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -8274,14 +8528,15 @@ struct llm_build_context {
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
- model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  } else {
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
  }

@@ -8423,9 +8678,9 @@ struct llm_build_context {
  );
  cb(Kcur, "Kcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -8541,9 +8796,9 @@ struct llm_build_context {
  );
  cb(Kcur, "Kcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -8654,9 +8909,9 @@ struct llm_build_context {
  );
  cb(Kcur, "Kcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -8768,9 +9023,9 @@ struct llm_build_context {
  );
  cb(Kcur, "Kcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

  if (il == n_layer - 1) {
@@ -8923,9 +9178,9 @@ struct llm_build_context {
  );
  cb(Kcur, "Kcur", il);

- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  }

  if (il == n_layer - 1) {
@@ -8967,12 +9222,140 @@ struct llm_build_context {
8967
9222
 
8968
9223
  cur = ggml_add(ctx0, cur, model.output_b);
8969
9224
  cb(cur, "result_output", -1);
9225
+ ggml_build_forward_expand(gf, cur);
9226
+ return gf;
9227
+ }
9228
+
9229
+ struct ggml_cgraph * build_phi3() {
9230
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
9231
+
9232
+ const int64_t n_embd_head = hparams.n_embd_head_v;
9233
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
9234
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
9235
+
9236
+ struct ggml_tensor * cur;
9237
+ struct ggml_tensor * inpL;
9238
+
9239
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
9240
+
9241
+ // inp_pos - contains the positions
9242
+ struct ggml_tensor * inp_pos = build_inp_pos();
9243
+
9244
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
9245
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
9246
+
9247
+ for (int il = 0; il < n_layer; ++il) {
9248
+ auto residual = inpL;
9249
+
9250
+ // self-attention
9251
+ {
9252
+ struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
9253
+ model.layers[il].attn_norm,
9254
+ NULL,
9255
+ LLM_NORM_RMS, cb, il);
9256
+ cb(attn_norm_output, "attn_norm", il);
9257
+
9258
+ struct ggml_tensor * Qcur = nullptr;
9259
+ struct ggml_tensor * Kcur = nullptr;
9260
+ struct ggml_tensor * Vcur = nullptr;
9261
+
9262
+ if (model.layers[il].wqkv) {
9263
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
9264
+ cb(cur, "wqkv", il);
9265
+
9266
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
9267
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
9268
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
9269
+ }
9270
+ else {
9271
+ Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
9272
+ Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
9273
+ Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
9274
+ }
9275
+
9276
+ cb(Qcur, "Qcur", il);
9277
+ cb(Kcur, "Kcur", il);
9278
+ cb(Vcur, "Vcur", il);
9279
+
9280
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9281
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9282
+
9283
+ Qcur = ggml_rope_custom(
9284
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9285
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9286
+ );
9287
+ cb(Qcur, "Qcur", il);
9288
+
9289
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
9290
+ cb(Qcur, "Qcur", il);
9291
+
9292
+ Kcur = ggml_rope_custom(
9293
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
9294
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
9295
+ );
9296
+ cb(Kcur, "Kcur", il);
9297
+
9298
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9299
+ model.layers[il].wo, model.layers[il].bo,
9300
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9301
+ }
9302
+
9303
+ if (il == n_layer - 1) {
9304
+ // skip computing output for unused tokens
9305
+ struct ggml_tensor* inp_out_ids = build_inp_out_ids();
9306
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9307
+ residual = ggml_get_rows(ctx0, residual, inp_out_ids);
9308
+ }
9309
+
9310
+ cur = ggml_add(ctx0, cur, residual);
9311
+ residual = cur;
9312
+
9313
+ cur = llm_build_norm(ctx0, cur, hparams,
9314
+ model.layers[il].ffn_norm, NULL,
9315
+ LLM_NORM_RMS, cb, il);
9316
+ cb(cur, "ffn_norm", il);
9317
+
9318
+ // FF
9319
+ // special-case: the up and gate tensors are merged into a single tensor
9320
+ // TOOD: support into llm_build_ffn
9321
+ {
9322
+ struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
9323
+ cb(up, "ffn_up", il);
9324
+
9325
+ auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0));
9326
+ auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2));
9327
+
9328
+ y = ggml_mul(ctx0, y, ggml_silu(ctx0, g));
9329
+ cb(y, "ffn_gate", il);
9330
+
9331
+ auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y);
9332
+ cb(down, "ffn_down", il);
9333
+
9334
+ cur = down;
9335
+ cb(cur, "ffn_out", il);
9336
+ }
9337
+
9338
+ cur = ggml_add(ctx0, residual, cur);
9339
+ cb(cur, "l_out", il);
9340
+
9341
+ inpL = cur;
9342
+ }
9343
+
9344
+ cur = llm_build_norm(ctx0, inpL, hparams,
9345
+ model.output_norm,
9346
+ NULL,
9347
+ LLM_NORM_RMS, cb, -1);
9348
+ cb(cur, "result_norm", -1);
9349
+
9350
+ cur = ggml_mul_mat(ctx0, model.output, cur);
9351
+ cb(cur, "result_output", -1);
8970
9352
 
8971
9353
  ggml_build_forward_expand(gf, cur);
8972
9354
 
8973
9355
  return gf;
8974
9356
  }
8975
9357
 
9358
+
8976
9359
  struct ggml_cgraph * build_plamo() {
8977
9360
  struct ggml_cgraph * gf = ggml_new_graph(ctx0);
8978
9361
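The new build_phi3() graph above handles Phi-3's fused feed-forward weights: ffn_up stores the gate and up projections in one tensor, which is split in half before the SiLU gate is applied (see the "special-case" comment in the hunk). A minimal standalone sketch of that split, using only ggml calls that appear in the hunk; the function name ffn_fused_silu and its arguments are illustrative, not part of the diff:

    // Sketch only (not part of the diff): split a fused [2*n_ff, n_tokens] up/gate
    // projection into halves and apply the SiLU gate, as build_phi3() does above.
    static struct ggml_tensor * ffn_fused_silu(struct ggml_context * ctx0,
                                               struct ggml_tensor  * up,        // ggml_mul_mat(ffn_up, cur)
                                               struct ggml_tensor  * ffn_down) {
        // first half of each row is the gate, second half is the up projection
        struct ggml_tensor * g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1],
                                               ggml_row_size(up->type, up->ne[0]), 0));
        struct ggml_tensor * y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1],
                                               ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2));
        y = ggml_mul(ctx0, y, ggml_silu(ctx0, g)); // SwiGLU-style gating
        return ggml_mul_mat(ctx0, ffn_down, y);    // down projection
    }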
 
@@ -9025,9 +9408,9 @@ struct llm_build_context {
9025
9408
  ext_factor, attn_factor, beta_fast, beta_slow);
9026
9409
  cb(Kcur, "Kcur", il);
9027
9410
 
9028
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9411
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9029
9412
  model.layers[il].wo, NULL,
9030
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9413
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9031
9414
  }
9032
9415
  struct ggml_tensor * sa_out = cur;
9033
9416
 
@@ -9128,9 +9511,9 @@ struct llm_build_context {
9128
9511
 
9129
9512
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9130
9513
 
9131
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9514
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9132
9515
  model.layers[il].wo, model.layers[il].bo,
9133
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9516
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9134
9517
  }
9135
9518
 
9136
9519
  if (il == n_layer - 1) {
@@ -9235,9 +9618,9 @@ struct llm_build_context {
9235
9618
  );
9236
9619
  cb(Kcur, "Kcur", il);
9237
9620
 
9238
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9621
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9239
9622
  model.layers[il].wo, model.layers[il].bo,
9240
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9623
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9241
9624
  }
9242
9625
 
9243
9626
  if (il == n_layer - 1) {
@@ -9351,9 +9734,9 @@ struct llm_build_context {
9351
9734
  );
9352
9735
  cb(Kcur, "Kcur", il);
9353
9736
 
9354
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9737
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9355
9738
  model.layers[il].wo, NULL,
9356
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9739
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9357
9740
  }
9358
9741
 
9359
9742
  if (il == n_layer - 1) {
@@ -9468,9 +9851,9 @@ struct llm_build_context {
9468
9851
  );
9469
9852
  cb(Kcur, "Kcur", il);
9470
9853
 
9471
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9854
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9472
9855
  model.layers[il].wo, model.layers[il].bo,
9473
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9856
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9474
9857
  }
9475
9858
 
9476
9859
  if (il == n_layer - 1) {
@@ -9598,9 +9981,9 @@ struct llm_build_context {
9598
9981
  );
9599
9982
  cb(Kcur, "Kcur", il);
9600
9983
 
9601
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9984
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9602
9985
  model.layers[il].wo, model.layers[il].bo,
9603
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9986
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9604
9987
  }
9605
9988
 
9606
9989
  if (il == n_layer - 1) {
@@ -9719,9 +10102,9 @@ struct llm_build_context {
9719
10102
  ext_factor, attn_factor, beta_fast, beta_slow);
9720
10103
  cb(Kcur, "Kcur", il);
9721
10104
 
9722
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10105
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9723
10106
  model.layers[il].wo, NULL,
9724
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
10107
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9725
10108
  }
9726
10109
 
9727
10110
  if (il == n_layer - 1) {
@@ -9838,9 +10221,9 @@ struct llm_build_context {
9838
10221
  );
9839
10222
  cb(Kcur, "Kcur", il);
9840
10223
 
9841
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10224
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9842
10225
  model.layers[il].wo, model.layers[il].bo,
9843
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10226
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9844
10227
  }
9845
10228
 
9846
10229
  if (il == n_layer - 1) {
@@ -10128,9 +10511,9 @@ struct llm_build_context {
10128
10511
  );
10129
10512
  cb(Kcur, "Kcur", il);
10130
10513
 
10131
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10514
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10132
10515
  model.layers[il].wo, model.layers[il].bo,
10133
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10516
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10134
10517
  }
10135
10518
 
10136
10519
  if (il == n_layer - 1) {
@@ -10259,9 +10642,9 @@ struct llm_build_context {
10259
10642
  );
10260
10643
  cb(Kcur, "Kcur", il);
10261
10644
 
10262
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10645
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10263
10646
  model.layers[il].wo, nullptr,
10264
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10647
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10265
10648
  }
10266
10649
 
10267
10650
  if (il == n_layer - 1) {
@@ -10474,6 +10857,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_phi2();
             } break;
+        case LLM_ARCH_PHI3:
+            {
+                result = llm.build_phi3();
+            } break;
         case LLM_ARCH_PLAMO:
             {
                 result = llm.build_plamo();
@@ -10684,7 +11071,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    if (hparams.need_kq_pos) {
+    // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
+    // this allows to process multiple sequences in parallel with ALiBi-based models
+    if (hparams.use_alibi) {
         const int64_t n_kv = kv_self.n;
 
         GGML_ASSERT(lctx.inp_KQ_pos);
@@ -11066,7 +11455,7 @@ static int llama_decode_internal(
             // a heuristic, to avoid attending the full cache if it is not yet utilized
             // after enough generations, the benefit from this heuristic disappears
             // if we start defragmenting the cache, the benefit from this will be more important
-            kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
+            kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
             //kv_self.n = llama_kv_cache_cell_max(kv_self);
         }
     }
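The heuristic above now pads the active KV window to a multiple of 256 cells (previously 32), matching the larger context padding applied in llama_new_context_with_model() later in this diff. A small worked example of the rounding; the cell count is hypothetical:

    // Sketch only: GGML_PAD(x, n) rounds x up to the next multiple of n (n is a power of two).
    const uint32_t cell_max = 300;                               // hypothetical llama_kv_cache_cell_max() result
    const uint32_t padded   = (cell_max + 256 - 1) & ~(256 - 1); // GGML_PAD(300, 256) == 512
    // kv_self.n = std::min(kv_self.size, std::max(256u, padded));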
@@ -11234,6 +11623,10 @@ static int llama_decode_internal(
         }
     }
 
+    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+    // overlap with device computation.
+    ggml_backend_sched_reset(lctx.sched);
+
     return 0;
 }
 
@@ -11259,7 +11652,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     // each move requires 6*n_layer tensors (see build_defrag)
     //   - source view, destination view, copy operation
     //   - x2 for keys and values
-    const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+    //const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+    // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
+    const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);
 
     // determine which KV cells to move where
     //
@@ -11575,7 +11970,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
 static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
     GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
     GGML_ASSERT(llama_is_byte_token(vocab, id));
-    const auto& token_data = vocab.id_to_token.at(id);
+    const auto & token_data = vocab.id_to_token.at(id);
     switch (llama_vocab_get_type(vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
             auto buf = token_data.text.substr(3, 2);
@@ -11583,7 +11978,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
         }
         case LLAMA_VOCAB_TYPE_BPE: {
             GGML_ASSERT(false);
-            return unicode_utf8_to_byte(token_data.text);
+            return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
         }
         case LLAMA_VOCAB_TYPE_WPM: {
             GGML_ASSERT(false);
@@ -11805,7 +12200,94 @@ struct llm_tokenizer_bpe {
11805
12200
 
11806
12201
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
11807
12202
  int final_prev_index = -1;
11808
- auto word_collection = bpe_gpt2_preprocess(text);
12203
+
12204
+ std::vector<std::string> word_collection;
12205
+ switch (vocab.type) {
12206
+ case LLAMA_VOCAB_TYPE_BPE:
12207
+ switch (vocab.type_pre) {
12208
+ case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
12209
+ case LLAMA_VOCAB_PRE_TYPE_DBRX:
12210
+ word_collection = unicode_regex_split(text, {
12211
+ // original regex from tokenizer.json
12212
+ //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12213
+
12214
+ // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
12215
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12216
+ });
12217
+ break;
12218
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
12219
+ word_collection = unicode_regex_split(text, {
12220
+ "[\r\n]",
12221
+ "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
12222
+ "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
12223
+ "\\s+$",
12224
+ "[一-龥ࠀ-一가-퟿]+",
12225
+ "\\p{N}+",
12226
+ });
12227
+ break;
12228
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
12229
+ word_collection = unicode_regex_split(text, {
12230
+ "[\r\n]",
12231
+ "\\s?\\p{L}+",
12232
+ "\\s?\\p{P}+",
12233
+ "[一-龥ࠀ-一가-퟿]+",
12234
+ "\\p{N}",
12235
+ });
12236
+ break;
12237
+ case LLAMA_VOCAB_PRE_TYPE_FALCON:
12238
+ word_collection = unicode_regex_split(text, {
12239
+ "[\\p{P}\\$\\+<=>\\^~\\|]+",
12240
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12241
+ "[0-9][0-9][0-9]",
12242
+ });
12243
+ break;
12244
+ case LLAMA_VOCAB_PRE_TYPE_MPT:
12245
+ // TODO: MPT pre-tokenization regexes are unknown
12246
+ // the following are close, but not exact. run the following:
12247
+ // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
12248
+ GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
12249
+ word_collection = unicode_regex_split(text, {
12250
+ "\\s?\\p{L}+",
12251
+ "\\s?\\p{P}+",
12252
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12253
+ });
12254
+ break;
12255
+ case LLAMA_VOCAB_PRE_TYPE_STARCODER:
12256
+ case LLAMA_VOCAB_PRE_TYPE_REFACT:
12257
+ case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
12258
+ word_collection = unicode_regex_split(text, {
12259
+ "\\p{N}",
12260
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12261
+ });
12262
+ break;
12263
+ case LLAMA_VOCAB_PRE_TYPE_GPT2:
12264
+ case LLAMA_VOCAB_PRE_TYPE_OLMO:
12265
+ word_collection = unicode_regex_split(text, {
12266
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12267
+ });
12268
+ break;
12269
+ case LLAMA_VOCAB_PRE_TYPE_QWEN2:
12270
+ word_collection = unicode_regex_split(text, {
12271
+ // original regex from tokenizer.json
12272
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
12273
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12274
+ });
12275
+ break;
12276
+ default:
12277
+ // default regex for BPE tokenization pre-processing
12278
+ word_collection = unicode_regex_split(text, {
12279
+ "[\\p{P}\\$\\+<=>\\^~\\|]+",
12280
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12281
+ "\\p{N}+",
12282
+ "[0-9][0-9][0-9]",
12283
+ });
12284
+ break;
12285
+ }
12286
+ break;
12287
+ default:
12288
+ GGML_ASSERT(false);
12289
+ break;
12290
+ }
11809
12291
 
11810
12292
  symbols_final.clear();
11811
12293
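This hunk replaces the hand-written GPT-2 pre-tokenizer (removed further below) with per-model regex sets selected by vocab.type_pre and applied through unicode_regex_split(). A hedged sketch of the call shape; the sample text is illustrative and the pattern is the default GPT-2 one from the hunk:

    // Sketch only: regex-based BPE pre-tokenization as dispatched above.
    const std::string text = "Hello   world, it's 2024!";   // illustrative input
    std::vector<std::string> word_collection = unicode_regex_split(text, {
        "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
    });
    // each resulting fragment is then merged independently by the BPE loop that follows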
 
@@ -11932,145 +12414,6 @@ private:
11932
12414
  work_queue.push(bigram);
11933
12415
  }
11934
12416
 
11935
- std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
11936
- std::vector<std::string> bpe_words;
11937
- std::vector<std::string> bpe_encoded_words;
11938
-
11939
- std::string token = "";
11940
- // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
11941
- bool collecting_numeric = false;
11942
- bool collecting_letter = false;
11943
- bool collecting_special = false;
11944
- bool collecting_whitespace_lookahead = false;
11945
- bool collecting = false;
11946
-
11947
- std::vector<std::string> text_utf;
11948
- text_utf.reserve(text.size());
11949
- bpe_words.reserve(text.size());
11950
- bpe_encoded_words.reserve(text.size());
11951
-
11952
- const auto cpts = unicode_cpts_from_utf8(text);
11953
- for (size_t i = 0; i < cpts.size(); ++i)
11954
- text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
11955
-
11956
- for (int i = 0; i < (int)text_utf.size(); i++) {
11957
- const std::string & utf_char = text_utf[i];
11958
- bool split_condition = false;
11959
- int bytes_remain = text_utf.size() - i;
11960
- // forward backward lookups
11961
- const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
11962
- const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
11963
-
11964
- // handling contractions
11965
- if (!split_condition && bytes_remain >= 2) {
11966
- // 's|'t|'m|'d
11967
- if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
11968
- split_condition = true;
11969
- }
11970
- if (split_condition) {
11971
- if (token.size()) {
11972
- bpe_words.emplace_back(token); // push previous content as token
11973
- }
11974
- token = utf_char + utf_char_next;
11975
- bpe_words.emplace_back(token);
11976
- token = "";
11977
- i++;
11978
- continue;
11979
- }
11980
- }
11981
- if (!split_condition && bytes_remain >= 3) {
11982
- // 're|'ve|'ll
11983
- if (utf_char == "\'" && (
11984
- (utf_char_next == "r" && utf_char_next_next == "e") ||
11985
- (utf_char_next == "v" && utf_char_next_next == "e") ||
11986
- (utf_char_next == "l" && utf_char_next_next == "l"))
11987
- ) {
11988
- split_condition = true;
11989
- }
11990
- if (split_condition) {
11991
- // current token + next token can be defined
11992
- if (token.size()) {
11993
- bpe_words.emplace_back(token); // push previous content as token
11994
- }
11995
- token = utf_char + utf_char_next + utf_char_next_next;
11996
- bpe_words.emplace_back(token); // the contraction
11997
- token = "";
11998
- i += 2;
11999
- continue;
12000
- }
12001
- }
12002
-
12003
- if (!split_condition && !collecting) {
12004
- if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
12005
- collecting_letter = true;
12006
- collecting = true;
12007
- }
12008
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
12009
- collecting_numeric = true;
12010
- collecting = true;
12011
- }
12012
- else if (
12013
- ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
12014
- (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
12015
- ) {
12016
- collecting_special = true;
12017
- collecting = true;
12018
- }
12019
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
12020
- collecting_whitespace_lookahead = true;
12021
- collecting = true;
12022
- }
12023
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
12024
- split_condition = true;
12025
- }
12026
- }
12027
- else if (!split_condition && collecting) {
12028
- if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
12029
- split_condition = true;
12030
- }
12031
- else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
12032
- split_condition = true;
12033
- }
12034
- else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
12035
- split_condition = true;
12036
- }
12037
- else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
12038
- split_condition = true;
12039
- }
12040
- }
12041
-
12042
- if (utf_char_next == "") {
12043
- split_condition = true; // final
12044
- token += utf_char;
12045
- }
12046
-
12047
- if (split_condition) {
12048
- if (token.size()) {
12049
- bpe_words.emplace_back(token);
12050
- }
12051
- token = utf_char;
12052
- collecting = false;
12053
- collecting_letter = false;
12054
- collecting_numeric = false;
12055
- collecting_special = false;
12056
- collecting_whitespace_lookahead = false;
12057
- }
12058
- else {
12059
- token += utf_char;
12060
- }
12061
- }
12062
-
12063
- for (std::string & word : bpe_words) {
12064
- std::string encoded_token = "";
12065
- for (char & c : word) {
12066
- encoded_token += unicode_byte_to_utf8(c);
12067
- }
12068
- bpe_encoded_words.emplace_back(encoded_token);
12069
- }
12070
-
12071
- return bpe_encoded_words;
12072
- }
12073
-
12074
12417
  const llama_vocab & vocab;
12075
12418
 
12076
12419
  std::vector<llm_symbol> symbols;
@@ -12145,7 +12488,7 @@ struct llm_tokenizer_wpm {
                 continue;
             }
             code = unicode_tolower(code);
-            if (type == CODEPOINT_TYPE_WHITESPACE) {
+            if (type == CODEPOINT_TYPE_SEPARATOR) {
                 code = ' ';
             }
             std::string s = unicode_cpt_to_utf8(code);
@@ -12390,7 +12733,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                if (add_special && vocab.special_add_bos == 1) {
+                if (add_special && vocab.special_add_bos != 0) {
                     GGML_ASSERT(vocab.special_bos_id != -1);
                     output.push_back(vocab.special_bos_id);
                 }
@@ -13478,7 +13821,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
     return result;
 }
 
-llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
     GGML_ASSERT(ctx);
 
     const int64_t t_start_sample_us = ggml_time_us();
@@ -13491,7 +13834,6 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
     }
 
     std::discrete_distribution<> dist(probs.begin(), probs.end());
-    auto & rng = ctx->rng;
     int idx = dist(rng);
 
     llama_token result = candidates->data[idx].id;
@@ -13501,6 +13843,10 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
     return result;
 }
 
+llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+    return llama_sample_token_with_rng(ctx, candidates, ctx->rng);
+}
+
 void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
     const int64_t t_start_sample_us = ggml_time_us();
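Splitting the sampler this way lets callers pass their own std::mt19937 so token selection can be reproduced (or isolated per sequence) instead of always drawing from ctx->rng. A hedged usage sketch; the candidates array is assumed to be prepared as usual:

    // Sketch only: reproducible sampling with a caller-owned RNG.
    std::mt19937 rng(1234);                                   // arbitrary fixed seed
    llama_token tok = llama_sample_token_with_rng(ctx, &candidates, rng);
    // llama_sample_token(ctx, &candidates) keeps the old behaviour, forwarding ctx->rng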
 
@@ -13829,13 +14175,16 @@ static void llama_tensor_dequantize_internal(
13829
14175
  if (qtype.to_float == NULL) {
13830
14176
  throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
13831
14177
  }
13832
- } else if (tensor->type != GGML_TYPE_F16) {
14178
+ } else if (tensor->type != GGML_TYPE_F16 &&
14179
+ tensor->type != GGML_TYPE_BF16) {
13833
14180
  throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
13834
14181
  }
13835
14182
 
13836
14183
  if (nthread < 2) {
13837
14184
  if (tensor->type == GGML_TYPE_F16) {
13838
14185
  ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
14186
+ } else if (tensor->type == GGML_TYPE_BF16) {
14187
+ ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
13839
14188
  } else if (ggml_is_quantized(tensor->type)) {
13840
14189
  qtype.to_float(tensor->data, f32_output, nelements);
13841
14190
  } else {
@@ -13844,7 +14193,14 @@ static void llama_tensor_dequantize_internal(
13844
14193
  return;
13845
14194
  }
13846
14195
 
13847
- size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
14196
+ size_t block_size;
14197
+ if (tensor->type == GGML_TYPE_F16 ||
14198
+ tensor->type == GGML_TYPE_BF16) {
14199
+ block_size = 1;
14200
+ } else {
14201
+ block_size = (size_t)ggml_blck_size(tensor->type);
14202
+ }
14203
+
13848
14204
  size_t block_size_bytes = ggml_type_size(tensor->type);
13849
14205
 
13850
14206
  GGML_ASSERT(nelements % block_size == 0);
@@ -13863,6 +14219,8 @@ static void llama_tensor_dequantize_internal(
13863
14219
  auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
13864
14220
  if (typ == GGML_TYPE_F16) {
13865
14221
  ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
14222
+ } else if (typ == GGML_TYPE_BF16) {
14223
+ ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
13866
14224
  } else {
13867
14225
  qtype.to_float(inbuf, outbuf, nels);
13868
14226
  }
@@ -14159,14 +14517,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
14159
14517
  }
14160
14518
 
14161
14519
  static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
14162
- std::mutex mutex;
14163
- int64_t counter = 0;
14164
- size_t new_size = 0;
14165
14520
  if (nthread < 2) {
14166
14521
  // single-thread
14167
- return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
14522
+ size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
14523
+ if (!ggml_validate_row_data(new_type, new_data, new_size)) {
14524
+ throw std::runtime_error("quantized data validation failed");
14525
+ }
14526
+ return new_size;
14168
14527
  }
14169
- auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
14528
+
14529
+ std::mutex mutex;
14530
+ int64_t counter = 0;
14531
+ size_t new_size = 0;
14532
+ bool valid = true;
14533
+ auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
14170
14534
  nrows, n_per_row, imatrix]() {
14171
14535
  const int64_t nrows_per_chunk = chunk_size / n_per_row;
14172
14536
  size_t local_size = 0;
@@ -14181,7 +14545,17 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
14181
14545
  }
14182
14546
  lock.unlock();
14183
14547
  const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
14184
- local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
14548
+ size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
14549
+ local_size += this_size;
14550
+
14551
+ // validate the quantized data
14552
+ const size_t row_size = ggml_row_size(new_type, n_per_row);
14553
+ void * this_data = (char *) new_data + first_row * row_size;
14554
+ if (!ggml_validate_row_data(new_type, this_data, this_size)) {
14555
+ std::unique_lock<std::mutex> lock(mutex);
14556
+ valid = false;
14557
+ break;
14558
+ }
14185
14559
  }
14186
14560
  };
14187
14561
  for (int it = 0; it < nthread - 1; ++it) {
@@ -14190,6 +14564,9 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
14190
14564
  compute();
14191
14565
  for (auto & w : workers) { w.join(); }
14192
14566
  workers.clear();
14567
+ if (!valid) {
14568
+ throw std::runtime_error("quantized data validation failed");
14569
+ }
14193
14570
  return new_size;
14194
14571
  }
14195
14572
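Both the single-threaded path and the worker lambda above now run ggml_validate_row_data() over every chunk they quantize and raise an error instead of writing corrupt data. A hedged sketch of the per-chunk pattern using the same calls:

    // Sketch only: quantize one chunk, then validate it before accepting the result.
    const size_t written = ggml_quantize_chunk(new_type, f32_data, new_data, /*start*/ 0, nrows, n_per_row, imatrix);
    if (!ggml_validate_row_data(new_type, new_data, written)) {
        throw std::runtime_error("quantized data validation failed");
    }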
 
@@ -14204,6 +14581,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
         case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
+        case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
         case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;
 
         // K-quants
@@ -14252,7 +14630,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
         kv_overrides = v->data();
     }
-    llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
+    llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
     ml.init_mappings(false); // no prefetching
 
     llama_model model;
@@ -14290,11 +14668,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         for (auto & o : overrides) {
             if (o.key[0] == 0) break;
             if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
-                gguf_set_val_f32(ctx_out, o.key, o.float_value);
+                gguf_set_val_f32(ctx_out, o.key, o.val_f64);
             } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
-                gguf_set_val_i32(ctx_out, o.key, o.int_value);
+                gguf_set_val_i32(ctx_out, o.key, o.val_i64);
             } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
-                gguf_set_val_bool(ctx_out, o.key, o.bool_value);
+                gguf_set_val_bool(ctx_out, o.key, o.val_bool);
+            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
+                gguf_set_val_str(ctx_out, o.key, o.val_str);
             } else {
                 LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
             }
@@ -14336,26 +14716,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14336
14716
  std::vector<no_init<uint8_t>> work;
14337
14717
  std::vector<no_init<float>> f32_conv_buf;
14338
14718
 
14719
+ uint16_t n_split = 1;
14720
+ // Assume split index is continuous
14721
+ if (params->keep_split) {
14722
+ for (int i = 0; i < ml.n_tensors; ++i) {
14723
+ n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
14724
+ }
14725
+ }
14726
+ std::vector<gguf_context*> ctx_outs(n_split, NULL);
14727
+ ctx_outs[0] = ctx_out;
14728
+
14339
14729
  // populate the original tensors so we get an initial meta data
14340
14730
  for (int i = 0; i < ml.n_tensors; ++i) {
14341
- const struct ggml_tensor * meta = ml.get_tensor_meta(i);
14342
- gguf_add_tensor(ctx_out, meta);
14731
+ auto weight = ml.get_weight(i);
14732
+ uint16_t i_split = params->keep_split ? weight->idx : 0;
14733
+ struct ggml_tensor * tensor = weight->tensor;
14734
+ if (ctx_outs[i_split] == NULL) {
14735
+ ctx_outs[i_split] = gguf_init_empty();
14736
+ }
14737
+ gguf_add_tensor(ctx_outs[i_split], tensor);
14343
14738
  }
14344
14739
 
14345
- std::ofstream fout(fname_out, std::ios::binary);
14346
- fout.exceptions(std::ofstream::failbit); // fail fast on write errors
14347
-
14348
- const size_t meta_size = gguf_get_meta_size(ctx_out);
14740
+ // Set split info if needed
14741
+ if (n_split > 1) {
14742
+ for (size_t i = 0; i < ctx_outs.size(); ++i) {
14743
+ gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
14744
+ gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
14745
+ gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
14746
+ }
14747
+ }
14349
14748
 
14350
- LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
14749
+ int cur_split = -1;
14750
+ std::ofstream fout;
14751
+ auto close_ofstream = [&]() {
14752
+ // Write metadata and close file handler
14753
+ if (fout.is_open()) {
14754
+ fout.seekp(0);
14755
+ std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
14756
+ gguf_get_meta_data(ctx_outs[cur_split], data.data());
14757
+ fout.write((const char *) data.data(), data.size());
14758
+ fout.close();
14759
+ }
14760
+ };
14761
+ auto new_ofstream = [&](int index) {
14762
+ cur_split = index;
14763
+ GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
14764
+ std::string fname = fname_out;
14765
+ if (params->keep_split) {
14766
+ char split_path[PATH_MAX] = {0};
14767
+ llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
14768
+ fname = std::string(split_path);
14769
+ }
14351
14770
 
14352
- // placeholder for the meta data
14353
- ::zeros(fout, meta_size);
14771
+ fout = std::ofstream(fname, std::ios::binary);
14772
+ fout.exceptions(std::ofstream::failbit); // fail fast on write errors
14773
+ const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
14774
+ // placeholder for the meta data
14775
+ ::zeros(fout, meta_size);
14776
+ };
14354
14777
 
14355
14778
  const auto tn = LLM_TN(model.arch);
14356
-
14779
+ new_ofstream(0);
14357
14780
  for (int i = 0; i < ml.n_tensors; ++i) {
14358
- struct ggml_tensor * tensor = ml.get_tensor_meta(i);
14781
+ auto weight = ml.get_weight(i);
14782
+ struct ggml_tensor * tensor = weight->tensor;
14783
+ if (weight->idx != cur_split && params->keep_split) {
14784
+ close_ofstream();
14785
+ new_ofstream(weight->idx);
14786
+ }
14359
14787
 
14360
14788
  const std::string name = ggml_get_name(tensor);
14361
14789
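When keep_split is set, the writer above opens one output stream per shard and derives each file name with llama_split_path(), keeping the shard layout of the input GGUF. A hedged sketch of the naming call; the prefix and counts are illustrative:

    // Sketch only: derive the output name of shard 0 of 4 from a path prefix.
    char split_path[PATH_MAX] = {0};
    llama_split_path(split_path, sizeof(split_path), "ggml-model-Q4_K_M.gguf", /*split_no*/ 0, /*split_count*/ 4);
    // produces a name of the form <prefix>-00001-of-00004.gguf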
 
@@ -14510,26 +14938,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14510
14938
  total_size_new += new_size;
14511
14939
 
14512
14940
  // update the gguf meta data as we go
14513
- gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
14514
- gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
14941
+ gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
14942
+ gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
14515
14943
 
14516
14944
  // write tensor data + padding
14517
14945
  fout.write((const char *) new_data, new_size);
14518
14946
  zeros(fout, GGML_PAD(new_size, align) - new_size);
14519
14947
  }
14520
-
14521
- // go back to beginning of file and write the updated meta data
14522
- {
14523
- fout.seekp(0);
14524
- std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
14525
- gguf_get_meta_data(ctx_out, data.data());
14526
- fout.write((const char *) data.data(), data.size());
14948
+ close_ofstream();
14949
+ for (auto & c:ctx_outs) {
14950
+ gguf_free(c);
14527
14951
  }
14528
14952
 
14529
- fout.close();
14530
-
14531
- gguf_free(ctx_out);
14532
-
14533
14953
  LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
14534
14954
  LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
14535
14955
 
@@ -14573,7 +14993,7 @@ static int llama_apply_lora_from_file_internal(
14573
14993
  std::unique_ptr<llama_model_loader> ml;
14574
14994
  if (path_base_model) {
14575
14995
  LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
14576
- ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
14996
+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
14577
14997
  ml->init_mappings(/*prefetch*/ false); // no prefetching
14578
14998
  }
14579
14999
 
@@ -14832,6 +15252,7 @@ struct llama_model_params llama_model_default_params() {
         /*.vocab_only    =*/ false,
         /*.use_mmap      =*/ true,
         /*.use_mlock     =*/ false,
+        /*.check_tensors =*/ false,
     };
 
 #ifdef GGML_USE_METAL
@@ -14868,6 +15289,7 @@ struct llama_context_params llama_context_default_params() {
         /*.logits_all          =*/ false,
         /*.embeddings          =*/ false,
         /*.offload_kqv         =*/ true,
+        /*.flash_attn          =*/ false,
         /*.abort_callback      =*/ nullptr,
         /*.abort_callback_data =*/ nullptr,
     };
@@ -14885,6 +15307,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.quantize_output_tensor =*/ true,
         /*.only_copy              =*/ false,
         /*.pure                   =*/ false,
+        /*.keep_split             =*/ false,
         /*.imatrix                =*/ nullptr,
         /*.kv_overrides           =*/ nullptr,
     };
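The three default-parameter structs gain check_tensors, flash_attn and keep_split, all defaulting to off. A hedged sketch of opting in to the new flash attention flag when creating a context; the model handle is assumed to be loaded already:

    // Sketch only: request flash attention for a new context (may still be forced off,
    // e.g. for ALiBi-based models or Grok, as the warnings later in this diff show).
    llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = true;
    llama_context * lctx = llama_new_context_with_model(model, cparams);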
@@ -15033,6 +15456,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.defrag_thold = params.defrag_thold;
     cparams.embeddings   = params.embeddings;
     cparams.offload_kqv  = params.offload_kqv;
+    cparams.flash_attn   = params.flash_attn;
     cparams.pooling_type = params.pooling_type;
 
     cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -15040,12 +15464,20 @@ struct llama_context * llama_new_context_with_model(
     cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
 
     // this is necessary due to kv_self.n being padded later during inference
-    cparams.n_ctx = GGML_PAD(cparams.n_ctx, 32);
+    cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
 
     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
-    cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
+
+    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
+    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
+    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
+    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
+        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
+        cparams.n_batch = GGML_KQ_MASK_PAD;
+    }
+
+    cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
     cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
                               hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
@@ -15077,6 +15509,16 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
+    if (cparams.flash_attn && hparams.use_alibi) {
+        LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
+        cparams.flash_attn = false;
+    }
+
+    if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
+        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
+        cparams.flash_attn = false;
+    }
+
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
@@ -15084,6 +15526,7 @@ struct llama_context * llama_new_context_with_model(
     LLAMA_LOG_INFO("%s: n_ctx      = %u\n", __func__, cparams.n_ctx);
     LLAMA_LOG_INFO("%s: n_batch    = %u\n", __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch   = %u\n", __func__, cparams.n_ubatch);
+    LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
     LLAMA_LOG_INFO("%s: freq_base  = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
 
@@ -15212,7 +15655,7 @@ struct llama_context * llama_new_context_with_model(
15212
15655
  }
15213
15656
  ctx->backends.push_back(ctx->backend_cpu);
15214
15657
 
15215
- if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) {
15658
+ if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
15216
15659
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
15217
15660
  llama_free(ctx);
15218
15661
  return nullptr;
@@ -15393,6 +15836,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_QWEN2:
         case LLM_ARCH_QWEN2MOE:
         case LLM_ARCH_PHI2:
+        case LLM_ARCH_PHI3:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_STARCODER2:
             return LLAMA_ROPE_TYPE_NEOX;
@@ -15406,6 +15850,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
     return LLAMA_ROPE_TYPE_NONE;
 }
 
+enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
+    return ctx->cparams.pooling_type;
+}
+
 int32_t llama_n_vocab(const struct llama_model * model) {
     return model->hparams.n_vocab;
 }
@@ -15806,6 +16254,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
     const size_t s_kv_head  = sizeof(uint32_t);
     const size_t s_kv_size  = sizeof(uint32_t);
     const size_t s_kv_used  = sizeof(uint32_t);
+    const size_t s_v_trans  = sizeof(uint32_t);
    const size_t s_kv       = ctx->kv_self.total_size();
    const size_t s_kv_cell  = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
    const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
@@ -15823,10 +16272,14 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
         + s_kv_head
         + s_kv_size
         + s_kv_used
+        + s_v_trans
         + s_kv
         + s_kv_cells
     );
 
+    // on session change it is very likely that the state size has changed - so we need to update this function
+    static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
+
     return s_total;
 }
 
@@ -15884,6 +16337,8 @@ struct llama_data_file_context : llama_data_context {
15884
16337
  *
15885
16338
  */
15886
16339
  static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
16340
+ llama_synchronize(ctx);
16341
+
15887
16342
  // copy rng
15888
16343
  {
15889
16344
  std::ostringstream rng_ss;
@@ -15970,11 +16425,13 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
15970
16425
  const uint32_t kv_size = kv_self.size;
15971
16426
  const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
15972
16427
  const uint32_t kv_used = kv_self.used;
16428
+ const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
15973
16429
 
15974
16430
  data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
15975
16431
  data_ctx->write(&kv_head, sizeof(kv_head));
15976
16432
  data_ctx->write(&kv_size, sizeof(kv_size));
15977
16433
  data_ctx->write(&kv_used, sizeof(kv_used));
16434
+ data_ctx->write(&v_trans, sizeof(v_trans));
15978
16435
 
15979
16436
  if (kv_buf_size) {
15980
16437
  const size_t pre_kv_buf_size = data_ctx->get_size_written();
@@ -15987,7 +16444,7 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
15987
16444
  ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
15988
16445
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
15989
16446
 
15990
- if (kv_self.recurrent) {
16447
+ if (kv_self.recurrent || !kv_self.v_trans) {
15991
16448
  // v is contiguous for recurrent models
15992
16449
  // TODO: use other tensors for state models than k and v
15993
16450
  const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
@@ -16036,6 +16493,8 @@ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
16036
16493
 
16037
16494
  // Sets the state reading from the specified source address
16038
16495
  size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16496
+ llama_synchronize(ctx);
16497
+
16039
16498
  const uint8_t * inp = src;
16040
16499
 
16041
16500
  // set rng
@@ -16118,11 +16577,15 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16118
16577
  uint32_t kv_head;
16119
16578
  uint32_t kv_size;
16120
16579
  uint32_t kv_used;
16580
+ uint32_t v_trans;
16121
16581
 
16122
16582
  memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
16123
16583
  memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
16124
16584
  memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
16125
16585
  memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
16586
+ memcpy(&v_trans, inp, sizeof(v_trans)); inp += sizeof(v_trans);
16587
+
16588
+ GGML_ASSERT(kv_self.v_trans == (bool) v_trans); // incompatible V transposition
16126
16589
 
16127
16590
  if (kv_self.size != kv_size) {
16128
16591
  // the KV cache needs to be big enough to load all the KV cells from the saved state
@@ -16132,6 +16595,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16132
16595
  __func__, kv_head, kv_size, kv_self.size);
16133
16596
  }
16134
16597
 
16598
+ llama_kv_cache_clear(ctx);
16599
+
16135
16600
  if (kv_buf_size) {
16136
16601
  const size_t pre_kv_buf_size = inp - src;
16137
16602
 
@@ -16143,7 +16608,7 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16143
16608
  ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
16144
16609
  inp += k_size;
16145
16610
 
16146
- if (kv_self.recurrent) {
16611
+ if (kv_self.recurrent || !kv_self.v_trans) {
16147
16612
  // v is contiguous for recurrent models
16148
16613
  // TODO: use other tensors for state models than k and v
16149
16614
  const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
@@ -16165,8 +16630,6 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16165
16630
  GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
16166
16631
  }
16167
16632
 
16168
- llama_kv_cache_clear(ctx);
16169
-
16170
16633
  ctx->kv_self.head = kv_head;
16171
16634
  ctx->kv_self.used = kv_used;
16172
16635
 
@@ -16340,6 +16803,8 @@ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id)
16340
16803
  }
16341
16804
 
16342
16805
  static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
16806
+ llama_synchronize(ctx);
16807
+
16343
16808
  const auto & kv_self = ctx->kv_self;
16344
16809
  GGML_ASSERT(!kv_self.recurrent); // not implemented
16345
16810
 
@@ -16424,28 +16889,49 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
16424
16889
  }
16425
16890
  }
16426
16891
 
16427
- // For the values, they are transposed, so we also need the element size and get the element ranges from each row
16428
- const uint32_t kv_size = kv_self.size;
16429
- for (int il = 0; il < (int)n_layer; ++il) {
16430
- // Write value type
16431
- const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16432
- data_ctx.write(&v_type_i, sizeof(v_type_i));
16892
+ // TODO: simplify, reduce copy-paste
16893
+ if (!kv_self.v_trans) {
16894
+ for (int il = 0; il < (int)n_layer; ++il) {
16895
+ // Write value type
16896
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16897
+ data_ctx.write(&v_type_i, sizeof(v_type_i));
16433
16898
 
16434
- // Write element size
16435
- const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16436
- data_ctx.write(&v_size_el, sizeof(v_size_el));
16899
+ // Write row size of value
16900
+ const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
16901
+ data_ctx.write(&v_size_row, sizeof(v_size_row));
16437
16902
 
16438
- // For each row, we get the element values of each cell
16439
- for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16440
- // Read each range of cells of v_size_el length each into tmp_buf and write out
16903
+ // Read each range of cells of v_size length each into tmp_buf and write out
16441
16904
  for (const auto & range : cell_ranges) {
16442
16905
  const size_t range_size = range.second - range.first;
16443
- const size_t src_offset = (range.first + j * kv_size) * v_size_el;
16444
- tmp_buf.resize(range_size * v_size_el);
16445
- ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
16906
+ tmp_buf.resize(range_size * v_size_row);
16907
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
16446
16908
  data_ctx.write(tmp_buf.data(), tmp_buf.size());
16447
16909
  }
16448
16910
  }
16911
+ } else {
16912
+ // For the values, they are transposed, so we also need the element size and get the element ranges from each row
16913
+ const uint32_t kv_size = kv_self.size;
16914
+ for (int il = 0; il < (int)n_layer; ++il) {
16915
+ // Write value type
16916
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16917
+ data_ctx.write(&v_type_i, sizeof(v_type_i));
16918
+
16919
+ // Write element size
16920
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16921
+ data_ctx.write(&v_size_el, sizeof(v_size_el));
16922
+
16923
+ // For each row, we get the element values of each cell
16924
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16925
+ // Read each range of cells of v_size_el length each into tmp_buf and write out
16926
+ for (const auto & range : cell_ranges) {
16927
+ const size_t range_size = range.second - range.first;
16928
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
16929
+ tmp_buf.resize(range_size * v_size_el);
16930
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
16931
+ data_ctx.write(tmp_buf.data(), tmp_buf.size());
16932
+ }
16933
+ }
16934
+ }
16449
16935
  }
16450
16936
 
16451
16937
  return data_ctx.get_size_written();
@@ -16457,6 +16943,8 @@ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_s
16457
16943
  }
16458
16944
 
16459
16945
  size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
16946
+ llama_synchronize(ctx);
16947
+
16460
16948
  auto & kv_self = ctx->kv_self;
16461
16949
  GGML_ASSERT(!kv_self.recurrent); // not implemented
16462
16950
 
@@ -16568,41 +17056,75 @@ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src,
  }
  }

- // For each layer, read the values for each cell (transposed)
- for (int il = 0; il < (int)n_layer; ++il) {
- // Read type of value
- int32_t v_type_i_ref;
- memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
- inp += sizeof(v_type_i_ref);
- const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
- if (v_type_i != v_type_i_ref) {
- llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
- LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
- return 0;
- }
+ // TODO: simplify, reduce copy-paste
+ if (!kv_self.v_trans) {
+ for (int il = 0; il < (int)n_layer; ++il) {
+ // Read type of value
+ int32_t v_type_i_ref;
+ memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
+ inp += sizeof(v_type_i_ref);
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+ if (v_type_i != v_type_i_ref) {
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+ return 0;
+ }

- // Read element size of value
- size_t v_size_el_ref;
- memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
- inp += sizeof(v_size_el_ref);
- const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
- if (v_size_el != v_size_el_ref) {
- llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
- LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
- return 0;
- }
+ // Read row size of value
+ size_t v_size_row_ref;
+ memcpy(&v_size_row_ref, inp, sizeof(v_size_row_ref));
+ inp += sizeof(v_size_row_ref);
+ const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
+ if (v_size_row != v_size_row_ref) {
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+ LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, v_size_row_ref, il);
+ return 0;
+ }

- if (cell_count) {
- // For each row in the transposed matrix, read the values for the whole cell range
- for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
- const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
- ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
- inp += cell_count * v_size_el;
+ if (cell_count) {
+ // Read and set the values for the whole cell range
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, kv_head * v_size_row, cell_count * v_size_row);
+ inp += cell_count * v_size_row;
+ }
+ }
+ } else {
+ // For each layer, read the values for each cell (transposed)
+ for (int il = 0; il < (int)n_layer; ++il) {
+ // Read type of value
+ int32_t v_type_i_ref;
+ memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
+ inp += sizeof(v_type_i_ref);
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+ if (v_type_i != v_type_i_ref) {
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+ return 0;
+ }
+
+ // Read element size of value
+ size_t v_size_el_ref;
+ memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
+ inp += sizeof(v_size_el_ref);
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+ if (v_size_el != v_size_el_ref) {
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
+ return 0;
+ }
+
+ if (cell_count) {
+ // For each row in the transposed matrix, read the values for the whole cell range
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+ const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
+ inp += cell_count * v_size_el;
+ }
  }
  }
  }

  const size_t nread = inp - src;
+
  return nread;
  }
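
With the llama_synchronize() call added at the top of llama_state_seq_set_data and the format checks above, a single-sequence save/restore round trip looks roughly like the sketch below. It assumes the matching llama_state_seq_get_size() query from llama.h; error handling is limited to the return-value conventions the diff itself shows (0 means a mismatch was detected):

    #include <cstdint>
    #include <vector>
    #include "llama.h"

    // Rough sketch: serialize the KV cells of seq 0 and replay them into seq 1.
    static bool copy_seq_state(struct llama_context * ctx) {
        const size_t n_bytes = llama_state_seq_get_size(ctx, 0);
        std::vector<uint8_t> buf(n_bytes);
        if (llama_state_seq_get_data(ctx, buf.data(), 0) != n_bytes) {
            return false; // serialization failed
        }
        // set_data returns 0 when one of the type/row-size/element-size checks fails
        return llama_state_seq_set_data(ctx, buf.data(), 1) != 0;
    }
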
 
@@ -16983,9 +17505,10 @@ int32_t llama_tokenize(

  static std::string llama_decode_text(const std::string & text) {
  std::string decoded_text;
- auto unicode_sequences = unicode_cpts_from_utf8(text);
- for (auto & unicode_sequence : unicode_sequences) {
- decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));
+
+ const auto cpts = unicode_cpts_from_utf8(text);
+ for (const auto cpt : cpts) {
+ decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
  }

  return decoded_text;
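
The rewrite above is a rename-only cleanup: the token text is split into Unicode codepoints, and each codepoint is mapped back to its original byte through the byte-level BPE table. As an illustration of the first half only, here is a hedged, standalone stand-in for what a unicode_cpts_from_utf8-style helper does; this is not the llama.cpp implementation and it skips full validation:

    #include <cstdint>
    #include <string>
    #include <vector>

    // Illustrative UTF-8 decoder: collect the codepoints of a string.
    static std::vector<uint32_t> cpts_from_utf8(const std::string & s) {
        std::vector<uint32_t> cpts;
        for (size_t i = 0; i < s.size(); ) {
            const uint8_t b = (uint8_t) s[i];
            int len; uint32_t cpt;
            if      (b < 0x80) { len = 1; cpt = b; }
            else if (b < 0xC0) { ++i; continue; }      // stray continuation byte: skip
            else if (b < 0xE0) { len = 2; cpt = b & 0x1F; }
            else if (b < 0xF0) { len = 3; cpt = b & 0x0F; }
            else               { len = 4; cpt = b & 0x07; }
            for (int k = 1; k < len && i + k < s.size(); ++k) {
                cpt = (cpt << 6) | ((uint8_t) s[i + k] & 0x3F);
            }
            cpts.push_back(cpt);
            i += len;
        }
        return cpts;
    }
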
@@ -17257,6 +17780,15 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
  }
+ } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
+ // Phi 3
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
+ }
+ if (add_ass) {
+ ss << "<|assistant|>\n";
+ }
  } else {
  // template not supported
  return -1;
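
The new branch detects the Phi 3 format either by the literal name "phi3" or by spotting the <|assistant|> and <|end|> markers in a Jinja template string. A hedged sketch of exercising it through the public llama_chat_apply_template API (passing a null model with an explicit template name is assumed to be enough to reach this branch; buffer sizing is simplified):

    #include <cstdio>
    #include <vector>
    #include "llama.h"

    int main() {
        const llama_chat_message chat[] = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!" },
        };
        std::vector<char> buf(1024);
        // add_ass = true appends the trailing "<|assistant|>\n" generation prompt
        const int32_t n = llama_chat_apply_template(nullptr, "phi3", chat, 2, true,
                                                    buf.data(), (int32_t) buf.size());
        if (n > 0 && n <= (int32_t) buf.size()) {
            // Expected shape:
            // <|system|>\nYou are a helpful assistant.<|end|>\n<|user|>\nHello!<|end|>\n<|assistant|>\n
            printf("%.*s", n, buf.data());
        }
        return 0;
    }
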
@@ -17340,7 +17872,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
  /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,

  /*.n_sample =*/ std::max(1, ctx->n_sample),
- /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+ /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
  /*.n_eval =*/ std::max(1, ctx->n_eval),
  };

@@ -17389,6 +17921,11 @@ const char * llama_print_system_info(void) {
  s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
  s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
+ #ifdef GGML_USE_LLAMAFILE
+ s += "LLAMAFILE = 1 | ";
+ #else
+ s += "LLAMAFILE = 0 | ";
+ #endif

  return s.c_str();
  }
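
The system-info string now also reports whether the build defined GGML_USE_LLAMAFILE. A small hedged sketch of checking the flag from application code; llama_print_system_info() is the existing public call, and the string format is the " | "-separated list built above:

    #include <cstdio>
    #include <cstring>
    #include "llama.h"

    int main() {
        // The returned string contains "LLAMAFILE = 1 | " when
        // GGML_USE_LLAMAFILE was defined at compile time, "LLAMAFILE = 0 | " otherwise.
        const char * info = llama_print_system_info();
        const bool llamafile_enabled = strstr(info, "LLAMAFILE = 1") != nullptr;
        printf("LLAMAFILE: %s\n", llamafile_enabled ? "enabled" : "disabled");
        return 0;
    }
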