llama_cpp 0.12.2 → 0.12.4

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
@@ -11,6 +11,12 @@
  # include "ggml-cuda.h"
  #elif defined(GGML_USE_CLBLAST)
  # include "ggml-opencl.h"
+ #elif defined(GGML_USE_VULKAN)
+ # include "ggml-vulkan.h"
+ #elif defined(GGML_USE_SYCL)
+ # include "ggml-sycl.h"
+ #elif defined(GGML_USE_KOMPUTE)
+ # include "ggml-kompute.h"
  #endif

  #ifdef GGML_USE_METAL
@@ -52,6 +58,7 @@
  #include <algorithm>
  #include <array>
  #include <cassert>
+ #include <cfloat>
  #include <cinttypes>
  #include <climits>
  #include <cmath>
@@ -192,8 +199,12 @@ enum llm_arch {
  LLM_ARCH_BLOOM,
  LLM_ARCH_STABLELM,
  LLM_ARCH_QWEN,
+ LLM_ARCH_QWEN2,
  LLM_ARCH_PHI2,
  LLM_ARCH_PLAMO,
+ LLM_ARCH_CODESHELL,
+ LLM_ARCH_ORION,
+ LLM_ARCH_INTERNLM2,
  LLM_ARCH_UNKNOWN,
  };

@@ -211,8 +222,12 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
  { LLM_ARCH_BLOOM, "bloom" },
  { LLM_ARCH_STABLELM, "stablelm" },
  { LLM_ARCH_QWEN, "qwen" },
+ { LLM_ARCH_QWEN2, "qwen2" },
  { LLM_ARCH_PHI2, "phi2" },
  { LLM_ARCH_PLAMO, "plamo" },
+ { LLM_ARCH_CODESHELL, "codeshell" },
+ { LLM_ARCH_ORION, "orion" },
+ { LLM_ARCH_INTERNLM2, "internlm2" },
  };

  enum llm_kv {
@@ -265,6 +280,7 @@ enum llm_kv {
  LLM_KV_TOKENIZER_PAD_ID,
  LLM_KV_TOKENIZER_ADD_BOS,
  LLM_KV_TOKENIZER_ADD_EOS,
+ LLM_KV_TOKENIZER_ADD_PREFIX,
  LLM_KV_TOKENIZER_HF_JSON,
  LLM_KV_TOKENIZER_RWKV,
  };
@@ -319,6 +335,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
  { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
  { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
+ { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
  { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
  { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
  };
@@ -566,6 +583,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_QWEN2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_PHI2,
  {
@@ -600,7 +634,62 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
-
+ {
+ LLM_ARCH_CODESHELL,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+ {
+ LLM_ARCH_ORION,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+ {
+ LLM_ARCH_INTERNLM2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1091,10 +1180,10 @@ struct llama_mlock {
  #ifdef __APPLE__
  #define MLOCK_SUGGESTION \
  "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
- "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
+ "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
  #else
  #define MLOCK_SUGGESTION \
- "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
+ "Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
  #endif

  bool raw_lock(const void * addr, size_t size) const {
@@ -1215,8 +1304,14 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
  if (host_buffer) {
  buft = ggml_backend_cuda_host_buffer_type();
  }
+ #elif defined(GGML_USE_SYCL)
+ buft = ggml_backend_sycl_host_buffer_type();
  #elif defined(GGML_USE_CPU_HBM)
  buft = ggml_backend_cpu_hbm_buffer_type();
+ #elif defined(GGML_USE_VULKAN)
+ if (host_buffer) {
+ buft = ggml_backend_vk_host_buffer_type();
+ }
  #endif

  if (buft == nullptr) {
@@ -1234,8 +1329,17 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
  buft = ggml_backend_metal_buffer_type();
  #elif defined(GGML_USE_CUBLAS)
  buft = ggml_backend_cuda_buffer_type(gpu);
+ #elif defined(GGML_USE_VULKAN)
+ buft = ggml_backend_vk_buffer_type();
+ #elif defined(GGML_USE_SYCL)
+ buft = ggml_backend_sycl_buffer_type(gpu);
  #elif defined(GGML_USE_CLBLAST)
  buft = ggml_backend_opencl_buffer_type();
+ #elif defined(GGML_USE_KOMPUTE)
+ buft = ggml_backend_kompute_buffer_type(gpu);
+ if (buft == nullptr) {
+ LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+ }
  #endif

  if (buft == nullptr) {
@@ -1284,12 +1388,16 @@ static llama_state g_state;
  // available llama models
  enum e_model {
  MODEL_UNKNOWN,
+ MODEL_0_5B,
  MODEL_1B,
  MODEL_3B,
+ MODEL_4B,
  MODEL_7B,
  MODEL_8B,
  MODEL_13B,
+ MODEL_14B,
  MODEL_15B,
+ MODEL_20B,
  MODEL_30B,
  MODEL_34B,
  MODEL_40B,
@@ -1393,6 +1501,9 @@ struct llama_cparams {

  bool mul_mat_q;
  bool offload_kqv;
+
+ ggml_backend_sched_eval_callback cb_eval;
+ void * cb_eval_user_data;
  };

  struct llama_layer {
@@ -1528,6 +1639,8 @@ struct llama_vocab {
  id special_suffix_id = 32008;
  id special_eot_id = 32010;

+ bool add_space_prefix = true;
+
  int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
  GGML_ASSERT(token_left.find(' ') == std::string::npos);
  GGML_ASSERT(token_left.find('\n') == std::string::npos);
@@ -1596,7 +1709,7 @@ struct llama_model {
  std::unique_ptr<llama_mmap> mapping;

  // objects representing data potentially being locked in memory
- llama_mlock mlock_buf;
+ std::vector<std::unique_ptr<llama_mlock>> mlock_bufs;
  llama_mlock mlock_mmap;

  // for quantize-stats only
@@ -1623,6 +1736,9 @@ struct llama_context {
  for (ggml_backend_t backend : backends) {
  ggml_backend_free(backend);
  }
+
+ ggml_backend_buffer_free(buf_input);
+ ggml_free(ctx_input);
  }

  llama_cparams cparams;
@@ -1669,8 +1785,14 @@ struct llama_context {
  // allocator for the input tensors
  ggml_tallocr * alloc = nullptr;

- // temporary buffer for copying data to/from the backend
- std::vector<no_init<uint8_t>> buf_copy;
+ // input tensors
+ ggml_backend_buffer_t buf_input = nullptr;
+ ggml_context * ctx_input = nullptr;
+ struct ggml_tensor * inp_tokens; // I32 [n_batch]
+ struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
+ struct ggml_tensor * inp_pos; // I32 [n_batch]
+ struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
+ struct ggml_tensor * inp_K_shift; // I32 [n_ctx]

  #ifdef GGML_USE_MPI
  ggml_mpi_context * ctx_mpi = NULL;
@@ -2254,20 +2376,21 @@ struct llama_model_loader {
  }

  switch (type_max) {
- case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
- case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
- case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
- case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
- case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
- case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break;
- case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break;
- case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break;
- case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
- case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
- case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
- case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
+ case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
+ case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
+ case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
+ case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
+ case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
+ case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break;
+ case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break;
+ case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break;
+ case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
+ case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
+ case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
+ case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
  case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
  case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
+ case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
  default:
  {
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2613,8 +2736,10 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
  case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
  case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
- case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
+ case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";

  default: return "unknown, may not work";
  }
@@ -2627,7 +2752,9 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_7B: return "7B";
  case MODEL_8B: return "8B";
  case MODEL_13B: return "13B";
+ case MODEL_14B: return "14B";
  case MODEL_15B: return "15B";
+ case MODEL_20B: return "20B";
  case MODEL_30B: return "30B";
  case MODEL_34B: return "34B";
  case MODEL_40B: return "40B";
@@ -2640,6 +2767,14 @@ static const char * llama_model_type_name(e_model type) {
  default: return "?B";
  }
  }
+ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
+ switch (type) {
+ case LLAMA_VOCAB_TYPE_SPM: return "SPM";
+ case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+ default: return "unknown";
+ }
+ }
+

  static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
  model.arch = ml.get_arch();
@@ -2830,6 +2965,7 @@ static void llm_load_hparams(
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

  switch (hparams.n_layer) {
+ case 24: model.type = e_model::MODEL_1B; break;
  case 32: model.type = e_model::MODEL_3B; break;
  default: model.type = e_model::MODEL_UNKNOWN;
  }
@@ -2844,6 +2980,17 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_QWEN2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 40: model.type = hparams.n_head == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
+ case 80: model.type = e_model::MODEL_70B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_PHI2:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2874,7 +3021,32 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_CODESHELL:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ switch (hparams.n_layer) {
+ case 42: model.type = e_model::MODEL_SMALL; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_ORION:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

+ switch (hparams.n_layer) {
+ case 40: model.type = e_model::MODEL_14B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_INTERNLM2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 48: model.type = e_model::MODEL_20B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }

@@ -2926,6 +3098,11 @@ static void llm_load_vocab(
  vocab.special_unk_id = 0;
  vocab.special_sep_id = -1;
  vocab.special_pad_id = -1;
+
+ const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+ if (add_space_prefix_keyidx != -1) {
+ vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+ } // The default value of add_space_prefix is true.
  } else if (tokenizer_name == "gpt2") {
  vocab.type = LLAMA_VOCAB_TYPE_BPE;

@@ -3138,7 +3315,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  // hparams
  LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
  LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
- LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
+ LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
  LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
  LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
  LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
@@ -3435,7 +3612,12 @@ static bool llm_load_tensors(
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ } else {
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
+ ml.n_created--; // artificial tensor
+ }
  }

  for (int i = 0; i < n_layer; ++i) {
@@ -3629,6 +3811,11 @@ static bool llm_load_tensors(
  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

+ // optional bias tensors, present in Stable LM 2 1.6B
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
+
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
  layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});

@@ -3666,6 +3853,41 @@ static bool llm_load_tensors(
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2});
  }
  } break;
+ case LLM_ARCH_QWEN2:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+ // optional bias tensors
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
  case LLM_ARCH_PHI2:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -3776,6 +3998,101 @@ static bool llm_load_tensors(
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
  }
  } break;
+ case LLM_ARCH_CODESHELL:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
+
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+ }
+ } break;
+ case LLM_ARCH_ORION:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
+ case LLM_ARCH_INTERNLM2:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ // layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -3812,8 +4129,10 @@ static bool llm_load_tensors(
  else {
  buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
  if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
- model.mlock_buf.init (ggml_backend_buffer_get_base(buf));
- model.mlock_buf.grow_to(ggml_backend_buffer_get_size(buf));
+ model.mlock_bufs.emplace_back(new llama_mlock);
+ auto & mlock_buf = model.mlock_bufs.back();
+ mlock_buf->init (ggml_backend_buffer_get_base(buf));
+ mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
  }
  }
  if (buf == nullptr) {
@@ -3870,7 +4189,7 @@ static bool llm_load_tensors(
  }

  // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
- static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
+ static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
  try {
  llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);

@@ -3891,6 +4210,22 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
  return 0;
  }

+ #ifdef GGML_USE_KOMPUTE
+ if (params.n_gpu_layers > 0 && (
+ !(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
+ || !(
+ model.ftype == LLAMA_FTYPE_ALL_F32 ||
+ model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
+ model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+ model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
+ )
+ )) {
+ // TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file
+ LLAMA_LOG_WARN("%s: disabling Kompute due to unsupported model arch or quantization\n", __func__);
+ params.n_gpu_layers = 0;
+ }
+ #endif
+
  if (!llm_load_tensors(
  ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
  params.progress_callback, params.progress_callback_user_data
@@ -3939,22 +4274,24 @@ static struct ggml_tensor * llm_build_inp_embd(
3939
4274
  const llama_hparams & hparams,
3940
4275
  const llama_batch & batch,
3941
4276
  struct ggml_tensor * tok_embd,
4277
+ struct ggml_tensor * inp_tokens,
4278
+ struct ggml_tensor * inp_embd,
3942
4279
  const llm_build_cb & cb) {
3943
4280
  const int64_t n_embd = hparams.n_embd;
3944
4281
 
3945
4282
  struct ggml_tensor * inpL;
3946
4283
 
3947
4284
  if (batch.token) {
3948
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
4285
+ struct ggml_tensor * inp_tokens_v = ggml_view_1d(ctx, inp_tokens, batch.n_tokens, 0);
3949
4286
  cb(inp_tokens, "inp_tokens", -1);
3950
4287
 
3951
- inpL = ggml_get_rows(ctx, tok_embd, inp_tokens);
4288
+ inpL = ggml_get_rows(ctx, tok_embd, inp_tokens_v);
3952
4289
  } else {
3953
4290
  #ifdef GGML_USE_MPI
3954
4291
  GGML_ASSERT(false && "not implemented");
3955
4292
  #endif
3956
4293
 
3957
- inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
4294
+ inpL = ggml_view_2d(ctx, inp_embd, n_embd, batch.n_tokens, inp_embd->nb[1], 0);
3958
4295
  }
3959
4296
 
3960
4297
  return inpL;
@@ -3968,6 +4305,7 @@ static void llm_build_k_shift(
3968
4305
  const llama_cparams & cparams,
3969
4306
  const llama_kv_cache & kv,
3970
4307
  struct ggml_cgraph * graph,
4308
+ struct ggml_tensor * K_shift,
3971
4309
  llm_rope_type type,
3972
4310
  int64_t n_ctx,
3973
4311
  float freq_base,
@@ -3984,9 +4322,6 @@ static void llm_build_k_shift(
3984
4322
  const float beta_fast = cparams.yarn_beta_fast;
3985
4323
  const float beta_slow = cparams.yarn_beta_slow;
3986
4324
 
3987
- struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
3988
- cb(K_shift, "K_shift", -1);
3989
-
3990
4325
  int rope_type = 0;
3991
4326
 
3992
4327
  switch (type) {
@@ -4174,6 +4509,7 @@ static struct ggml_tensor * llm_build_kqv(
4174
4509
  const llama_model & model,
4175
4510
  const llama_hparams & hparams,
4176
4511
  const llama_kv_cache & kv,
4512
+ struct ggml_cgraph * graph,
4177
4513
  struct ggml_tensor * wo,
4178
4514
  struct ggml_tensor * wo_b,
4179
4515
  struct ggml_tensor * q_cur,
@@ -4252,6 +4588,8 @@ static struct ggml_tensor * llm_build_kqv(
4252
4588
  struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
4253
4589
  cb(cur, "kqv_merged_cont", il);
4254
4590
 
4591
+ ggml_build_forward_expand(graph, cur);
4592
+
4255
4593
  cur = ggml_mul_mat(ctx, wo, cur);
4256
4594
  if (wo_b) {
4257
4595
  cb(cur, "kqv_wo", il);
@@ -4264,17 +4602,56 @@ static struct ggml_tensor * llm_build_kqv(
4264
4602
  return cur;
4265
4603
  }
4266
4604
 
4267
- struct llm_build_context {
4268
- const llama_model & model;
4269
- const llama_hparams & hparams;
4270
- const llama_cparams & cparams;
4271
- const llama_batch & batch;
4272
- const llama_kv_cache & kv_self;
4273
-
4274
- const int64_t n_embd;
4275
- const int64_t n_layer;
4276
- const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
4277
- const int64_t n_head;
4605
+ static struct ggml_tensor * llm_build_kv(
4606
+ struct ggml_context * ctx,
4607
+ const llama_model & model,
4608
+ const llama_hparams & hparams,
4609
+ const llama_kv_cache & kv,
4610
+ struct ggml_cgraph * graph,
4611
+ struct ggml_tensor * wo,
4612
+ struct ggml_tensor * wo_b,
4613
+ struct ggml_tensor * k_cur,
4614
+ struct ggml_tensor * v_cur,
4615
+ struct ggml_tensor * q_cur,
4616
+ struct ggml_tensor * kq_mask,
4617
+ int64_t n_ctx,
4618
+ int32_t n_tokens,
4619
+ int32_t kv_head,
4620
+ int32_t n_kv,
4621
+ float max_alibi_bias,
4622
+ float kq_scale,
4623
+ const llm_build_cb & cb,
4624
+ int il) {
4625
+
4626
+ // these nodes are added to the graph together so that they are not reordered
4627
+ // by doing so, the number of splits in the graph is reduced
4628
+ ggml_build_forward_expand(graph, q_cur);
4629
+ ggml_build_forward_expand(graph, k_cur);
4630
+ ggml_build_forward_expand(graph, v_cur);
4631
+
4632
+ llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
4633
+
4634
+ struct ggml_tensor * cur;
4635
+ cur = llm_build_kqv(ctx, model, hparams, kv, graph,
4636
+ wo, wo_b,
4637
+ q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
4638
+ cb(cur, "kqv_out", il);
4639
+
4640
+ return cur;
4641
+ }
4642
+
4643
+ struct llm_build_context {
4644
+ const llama_model & model;
4645
+ const llama_context & lctx;
4646
+ const llama_hparams & hparams;
4647
+ const llama_cparams & cparams;
4648
+ const llama_batch & batch;
4649
+ const llama_kv_cache & kv_self;
4650
+
4651
+ const int64_t n_embd;
4652
+ const int64_t n_layer;
4653
+ const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
4654
+ const int64_t n_head;
4278
4655
  const int64_t n_head_kv;
4279
4656
  const int64_t n_embd_head_k;
4280
4657
  const int64_t n_embd_k_gqa;
@@ -4312,6 +4689,7 @@ struct llm_build_context {
4312
4689
  const llm_build_cb & cb,
4313
4690
  bool worst_case) :
4314
4691
  model (lctx.model),
4692
+ lctx (lctx),
4315
4693
  hparams (model.hparams),
4316
4694
  cparams (lctx.cparams),
4317
4695
  batch (batch),
@@ -4372,20 +4750,20 @@ struct llm_build_context {
4372
4750
  struct ggml_tensor * cur;
4373
4751
  struct ggml_tensor * inpL;
4374
4752
 
4375
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
4753
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
4376
4754
  cb(inpL, "inp_embd", -1);
4377
4755
 
4378
4756
  // inp_pos - contains the positions
4379
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4757
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
4380
4758
  cb(inp_pos, "inp_pos", -1);
4381
4759
 
4382
4760
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4383
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4761
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
4384
4762
  cb(KQ_mask, "KQ_mask", -1);
4385
4763
 
4386
4764
  // shift the entire K-cache if needed
4387
4765
  if (do_rope_shift) {
4388
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
4766
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
4389
4767
  }
4390
4768
 
4391
4769
  for (int il = 0; il < n_layer; ++il) {
@@ -4421,12 +4799,6 @@ struct llm_build_context {
4421
4799
  cb(Vcur, "Vcur", il);
4422
4800
  }
4423
4801
 
4424
- // these nodes are added to the graph together so that they are not reordered
4425
- // by doing so, the number of splits in the graph is reduced
4426
- ggml_build_forward_expand(gf, Qcur);
4427
- ggml_build_forward_expand(gf, Kcur);
4428
- ggml_build_forward_expand(gf, Vcur);
4429
-
4430
4802
  Qcur = ggml_rope_custom(
4431
4803
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
4432
4804
  hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
@@ -4441,11 +4813,9 @@ struct llm_build_context {
4441
4813
  );
4442
4814
  cb(Kcur, "Kcur", il);
4443
4815
 
4444
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4445
-
4446
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
4816
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
4447
4817
  model.layers[il].wo, model.layers[il].bo,
4448
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4818
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4449
4819
  cb(cur, "kqv_out", il);
4450
4820
  }
4451
4821
 
@@ -4564,20 +4934,20 @@ struct llm_build_context {
4564
4934
  struct ggml_tensor * cur;
4565
4935
  struct ggml_tensor * inpL;
4566
4936
 
4567
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
4937
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
4568
4938
  cb(inpL, "inp_embd", -1);
4569
4939
 
4570
4940
  // inp_pos - contains the positions
4571
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4941
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
4572
4942
  cb(inp_pos, "inp_pos", -1);
4573
4943
 
4574
4944
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4575
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4945
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
4576
4946
  cb(KQ_mask, "KQ_mask", -1);
4577
4947
 
4578
4948
  // shift the entire K-cache if needed
4579
4949
  if (do_rope_shift) {
4580
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
4950
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
4581
4951
  }
4582
4952
 
4583
4953
  for (int il = 0; il < n_layer; ++il) {
@@ -4622,14 +4992,13 @@ struct llm_build_context {
4622
4992
  cb(Qcur, "Qcur", il);
4623
4993
  cb(Kcur, "Kcur", il);
4624
4994
 
4625
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4626
4995
 
4627
4996
  // apply ALiBi for 13B model
4628
4997
  const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
4629
4998
 
4630
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
4999
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
4631
5000
  model.layers[il].wo, NULL,
4632
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5001
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4633
5002
  cb(cur, "kqv_out", il);
4634
5003
  }
4635
5004
 
@@ -4686,20 +5055,20 @@ struct llm_build_context {
4686
5055
  struct ggml_tensor * cur;
4687
5056
  struct ggml_tensor * inpL;
4688
5057
 
4689
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5058
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
4690
5059
  cb(inpL, "inp_embd", -1);
4691
5060
 
4692
5061
  // inp_pos - contains the positions
4693
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5062
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
4694
5063
  cb(inp_pos, "inp_pos", -1);
4695
5064
 
4696
5065
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4697
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5066
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
4698
5067
  cb(KQ_mask, "KQ_mask", -1);
4699
5068
 
4700
5069
  // shift the entire K-cache if needed
4701
5070
  if (do_rope_shift) {
4702
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5071
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
4703
5072
  }
4704
5073
 
4705
5074
  for (int il = 0; il < n_layer; ++il) {
@@ -4751,11 +5120,9 @@ struct llm_build_context {
4751
5120
  );
4752
5121
  cb(Kcur, "Kcur", il);
4753
5122
 
4754
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4755
-
4756
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5123
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
4757
5124
  model.layers[il].wo, NULL,
4758
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5125
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4759
5126
  cb(cur, "kqv_out", il);
4760
5127
  }
4761
5128
 
@@ -4810,15 +5177,15 @@ struct llm_build_context {
4810
5177
  struct ggml_tensor * pos;
4811
5178
  struct ggml_tensor * inpL;
4812
5179
 
4813
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5180
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
4814
5181
  cb(inpL, "inp_embd", -1);
4815
5182
 
4816
5183
  // inp_pos - contains the positions
4817
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5184
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
4818
5185
  cb(inp_pos, "inp_pos", -1);
4819
5186
 
4820
5187
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4821
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5188
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
4822
5189
  cb(KQ_mask, "KQ_mask", -1);
4823
5190
 
4824
5191
  pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
@@ -4852,11 +5219,9 @@ struct llm_build_context {
4852
5219
 
4853
5220
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
4854
5221
 
4855
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4856
-
4857
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5222
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
4858
5223
  model.layers[il].wo, model.layers[il].bo,
4859
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5224
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4860
5225
  cb(cur, "kqv_out", il);
4861
5226
  }
4862
5227
 
@@ -4909,19 +5274,19 @@ struct llm_build_context {
4909
5274
  struct ggml_tensor * cur;
4910
5275
  struct ggml_tensor * inpL;
4911
5276
 
4912
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5277
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
4913
5278
  cb(inpL, "inp_embd", -1);
4914
5279
 
4915
5280
  // inp_pos - contains the positions
4916
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5281
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
4917
5282
  cb(inp_pos, "inp_pos", -1);
4918
5283
 
4919
5284
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4920
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5285
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
4921
5286
  cb(KQ_mask, "KQ_mask", -1);
4922
5287
 
4923
5288
  if (do_rope_shift) {
4924
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5289
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
4925
5290
  }
4926
5291
 
4927
5292
  for (int il = 0; il < n_layer; ++il) {
@@ -5059,12 +5424,9 @@ struct llm_build_context {
5059
5424
  );
5060
5425
  cb(Vcur, "Vcur", il);
5061
5426
 
5062
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5063
-
5064
- // TODO: not tested, could be broken
5065
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5427
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5066
5428
  model.layers[il].wo, model.layers[il].bo,
5067
- Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5429
+ Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5068
5430
  cb(cur, "kqv_out", il);
5069
5431
  }
5070
5432
 
@@ -5119,11 +5481,11 @@ struct llm_build_context {
5119
5481
  struct ggml_tensor * cur;
5120
5482
  struct ggml_tensor * inpL;
5121
5483
 
5122
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5484
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5123
5485
  cb(inpL, "inp_embd", -1);
5124
5486
 
5125
5487
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5126
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5488
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5127
5489
  cb(KQ_mask, "KQ_mask", -1);
5128
5490
 
5129
5491
  for (int il = 0; il < n_layer; ++il) {
@@ -5151,11 +5513,9 @@ struct llm_build_context {
5151
5513
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5152
5514
  cb(Qcur, "Qcur", il);
5153
5515
 
5154
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5155
-
5156
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5516
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5157
5517
  model.layers[il].wo, NULL,
5158
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5518
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5159
5519
  cb(cur, "kqv_out", il);
5160
5520
  }
5161
5521
 
@@ -5211,11 +5571,11 @@ struct llm_build_context {
5211
5571
  struct ggml_tensor * cur;
5212
5572
  struct ggml_tensor * inpL;
5213
5573
 
5214
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5574
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5215
5575
  cb(inpL, "inp_embd", -1);
5216
5576
 
5217
5577
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5218
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5578
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5219
5579
  cb(KQ_mask, "KQ_mask", -1);
5220
5580
 
5221
5581
  inpL = llm_build_norm(ctx0, inpL, hparams,
@@ -5249,11 +5609,9 @@ struct llm_build_context {
5249
5609
 
5250
5610
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5251
5611
 
5252
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5253
-
5254
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5612
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5255
5613
  model.layers[il].wo, model.layers[il].bo,
5256
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5614
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5257
5615
  cb(cur, "kqv_out", il);
5258
5616
  }
5259
5617
 
@@ -5306,11 +5664,11 @@ struct llm_build_context {
5306
5664
  struct ggml_tensor * cur;
5307
5665
  struct ggml_tensor * inpL;
5308
5666
 
5309
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5667
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5310
5668
  cb(inpL, "inp_embd", -1);
5311
5669
 
5312
5670
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5313
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5671
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5314
5672
  cb(KQ_mask, "KQ_mask", -1);
5315
5673
 
5316
5674
  for (int il = 0; il < n_layer; ++il) {
@@ -5344,11 +5702,9 @@ struct llm_build_context {
5344
5702
 
5345
5703
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5346
5704
 
5347
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5348
-
5349
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5705
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5350
5706
  model.layers[il].wo, NULL,
5351
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5707
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5352
5708
  cb(cur, "kqv_out", il);
5353
5709
  }
5354
5710
 
@@ -5404,20 +5760,20 @@ struct llm_build_context {
5404
5760
  struct ggml_tensor * cur;
5405
5761
  struct ggml_tensor * inpL;
5406
5762
 
5407
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5763
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5408
5764
  cb(inpL, "inp_embd", -1);
5409
5765
 
5410
5766
  // inp_pos - contains the positions
5411
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5767
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5412
5768
  cb(inp_pos, "inp_pos", -1);
5413
5769
 
5414
5770
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5415
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5771
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5416
5772
  cb(KQ_mask, "KQ_mask", -1);
5417
5773
 
5418
5774
  // shift the entire K-cache if needed
5419
5775
  if (do_rope_shift) {
5420
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5776
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5421
5777
  }
5422
5778
 
5423
5779
  for (int il = 0; il < n_layer; ++il) {
@@ -5435,12 +5791,24 @@ struct llm_build_context {
5435
5791
  // compute Q and K and RoPE them
5436
5792
  struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
5437
5793
  cb(Qcur, "Qcur", il);
5794
+ if (model.layers[il].bq) {
5795
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
5796
+ cb(Qcur, "Qcur", il);
5797
+ }
5438
5798
 
5439
5799
  struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
5440
5800
  cb(Kcur, "Kcur", il);
5801
+ if (model.layers[il].bk) {
5802
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
5803
+ cb(Kcur, "Kcur", il);
5804
+ }
5441
5805
 
5442
5806
  struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
5443
5807
  cb(Vcur, "Vcur", il);
5808
+ if (model.layers[il].bv) {
5809
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
5810
+ cb(Vcur, "Vcur", il);
5811
+ }
5444
5812
 
5445
5813
  Qcur = ggml_rope_custom(
5446
5814
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
@@ -5456,11 +5824,9 @@ struct llm_build_context {
5456
5824
  );
5457
5825
  cb(Kcur, "Kcur", il);
5458
5826
 
5459
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5460
-
5461
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5827
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5462
5828
  model.layers[il].wo, NULL,
5463
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5829
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5464
5830
  cb(cur, "kqv_out", il);
5465
5831
  }
5466
5832
 
@@ -5517,20 +5883,20 @@ struct llm_build_context {
5517
5883
  struct ggml_tensor * cur;
5518
5884
  struct ggml_tensor * inpL;
5519
5885
 
5520
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5886
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5521
5887
  cb(inpL, "inp_embd", -1);
5522
5888
 
5523
5889
  // inp_pos - contains the positions
5524
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5890
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5525
5891
  cb(inp_pos, "inp_pos", -1);
5526
5892
 
5527
5893
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5528
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5894
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5529
5895
  cb(KQ_mask, "KQ_mask", -1);
5530
5896
 
5531
5897
  // shift the entire K-cache if needed
5532
5898
  if (do_rope_shift) {
5533
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5899
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5534
5900
  }
5535
5901
 
5536
5902
  for (int il = 0; il < n_layer; ++il) {
@@ -5573,11 +5939,9 @@ struct llm_build_context {
5573
5939
  );
5574
5940
  cb(Kcur, "Kcur", il);
5575
5941
 
5576
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5577
-
5578
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5942
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5579
5943
  model.layers[il].wo, NULL,
5580
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5944
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5581
5945
  cb(cur, "kqv_out", il);
5582
5946
  }
5583
5947
 
@@ -5622,6 +5986,126 @@ struct llm_build_context {
5622
5986
 
5623
5987
  return gf;
5624
5988
  }
5989
+
5990
+ struct ggml_cgraph * build_qwen2() {
5991
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5992
+
5993
+ const int64_t n_embd_head = hparams.n_embd_head_v;
5994
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5995
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
5996
+
5997
+ struct ggml_tensor * cur;
5998
+ struct ggml_tensor * inpL;
5999
+
6000
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6001
+ cb(inpL, "inp_embd", -1);
6002
+
6003
+ // inp_pos - contains the positions
6004
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6005
+ cb(inp_pos, "inp_pos", -1);
6006
+
6007
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6008
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6009
+ cb(KQ_mask, "KQ_mask", -1);
6010
+
6011
+ // shift the entire K-cache if needed
6012
+ if (do_rope_shift) {
6013
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6014
+ }
6015
+
6016
+ for (int il = 0; il < n_layer; ++il) {
6017
+ struct ggml_tensor * inpSA = inpL;
6018
+
6019
+ // norm
6020
+ cur = llm_build_norm(ctx0, inpL, hparams,
6021
+ model.layers[il].attn_norm, NULL,
6022
+ LLM_NORM_RMS, cb, il);
6023
+ cb(cur, "attn_norm", il);
6024
+
6025
+ // self-attention
6026
+ {
6027
+ // compute Q and K and RoPE them
6028
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6029
+ cb(Qcur, "Qcur", il);
6030
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
6031
+ cb(Qcur, "Qcur", il);
6032
+
6033
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
6034
+ cb(Kcur, "Kcur", il);
6035
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
6036
+ cb(Kcur, "Kcur", il);
6037
+
6038
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
6039
+ cb(Vcur, "Vcur", il);
6040
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
6041
+ cb(Vcur, "Vcur", il);
6042
+
6043
+ // these nodes are added to the graph together so that they are not reordered
6044
+ // by doing so, the number of splits in the graph is reduced
6045
+ ggml_build_forward_expand(gf, Qcur);
6046
+ ggml_build_forward_expand(gf, Kcur);
6047
+ ggml_build_forward_expand(gf, Vcur);
6048
+
6049
+ Qcur = ggml_rope_custom(
6050
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6051
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6052
+ ext_factor, attn_factor, beta_fast, beta_slow
6053
+ );
6054
+ cb(Qcur, "Qcur", il);
6055
+
6056
+ Kcur = ggml_rope_custom(
6057
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6058
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6059
+ ext_factor, attn_factor, beta_fast, beta_slow
6060
+ );
6061
+ cb(Kcur, "Kcur", il);
6062
+
6063
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6064
+ model.layers[il].wo, model.layers[il].bo,
6065
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6066
+ cb(cur, "kqv_out", il);
6067
+ }
6068
+
6069
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6070
+ cb(ffn_inp, "ffn_inp", il);
6071
+
6072
+ // feed-forward network
6073
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6074
+ model.layers[il].ffn_norm, NULL,
6075
+ LLM_NORM_RMS, cb, il);
6076
+ cb(cur, "ffn_norm", il);
6077
+
6078
+ cur = llm_build_ffn(ctx0, cur,
6079
+ model.layers[il].ffn_up, NULL,
6080
+ model.layers[il].ffn_gate, NULL,
6081
+ model.layers[il].ffn_down, NULL,
6082
+ NULL,
6083
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
6084
+ cb(cur, "ffn_out", il);
6085
+
6086
+ cur = ggml_add(ctx0, cur, ffn_inp);
6087
+ cb(cur, "l_out", il);
6088
+
6089
+ // input for next layer
6090
+ inpL = cur;
6091
+ }
6092
+
6093
+ cur = inpL;
6094
+
6095
+ cur = llm_build_norm(ctx0, cur, hparams,
6096
+ model.output_norm, NULL,
6097
+ LLM_NORM_RMS, cb, -1);
6098
+ cb(cur, "result_norm", -1);
6099
+
6100
+ // lm_head
6101
+ cur = ggml_mul_mat(ctx0, model.output, cur);
6102
+ cb(cur, "result_output", -1);
6103
+
6104
+ ggml_build_forward_expand(gf, cur);
6105
+
6106
+ return gf;
6107
+ }
6108
+
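The feed-forward block in build_qwen2 above (llm_build_ffn called with LLM_FFN_SILU and LLM_FFN_PAR, the same pattern reused by the orion and internlm2 builders further down) is the usual SwiGLU arrangement: down( silu(gate(x)) * up(x) ). A minimal standalone sketch of that combination, with plain vectors standing in for the already-projected ggml tensors (all names here are illustrative, not part of the library):

#include <cmath>
#include <vector>

// SwiGLU-style mixing as used by the LLM_FFN_SILU + LLM_FFN_PAR path:
// gate and up projections run in parallel, the gate goes through SiLU,
// and the element-wise product is what ffn_down is applied to afterwards.
static float silu(float x) { return x / (1.0f + std::exp(-x)); }

// gate_x and up_x stand in for ffn_gate*x and ffn_up*x (already projected).
std::vector<float> swiglu_mix(const std::vector<float> & gate_x,
                              const std::vector<float> & up_x) {
    std::vector<float> out(gate_x.size());
    for (size_t i = 0; i < gate_x.size(); ++i) {
        out[i] = silu(gate_x[i]) * up_x[i];
    }
    return out;
}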
5625
6109
  struct ggml_cgraph * build_phi2() {
5626
6110
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5627
6111
 
@@ -5634,20 +6118,20 @@ struct llm_build_context {
5634
6118
  struct ggml_tensor * ffn_output;
5635
6119
  struct ggml_tensor * inpL;
5636
6120
 
5637
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
6121
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5638
6122
  cb(inpL, "inp_embd", -1);
5639
6123
 
5640
6124
  // inp_pos - contains the positions
5641
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
6125
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5642
6126
  cb(inp_pos, "inp_pos", -1);
5643
6127
 
5644
6128
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5645
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
6129
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5646
6130
  cb(KQ_mask, "KQ_mask", -1);
5647
6131
 
5648
6132
  // shift the entire K-cache if needed
5649
6133
  if (do_rope_shift) {
5650
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
6134
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5651
6135
  }
5652
6136
 
5653
6137
  for (int il = 0; il < n_layer; ++il) {
@@ -5703,11 +6187,9 @@ struct llm_build_context {
5703
6187
  );
5704
6188
  cb(Kcur, "Kcur", il);
5705
6189
 
5706
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5707
-
5708
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
6190
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5709
6191
  model.layers[il].wo, model.layers[il].bo,
5710
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
6192
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il);
5711
6193
  cb(cur, "kqv_out", il);
5712
6194
  }
5713
6195
 
@@ -5758,20 +6240,20 @@ struct llm_build_context {
5758
6240
  struct ggml_tensor * cur;
5759
6241
  struct ggml_tensor * inpL;
5760
6242
 
5761
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
6243
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5762
6244
  cb(inpL, "inp_embd", -1);
5763
6245
 
5764
6246
  // inp_pos - contains the positions
5765
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
6247
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5766
6248
  cb(inp_pos, "inp_pos", -1);
5767
6249
 
5768
6250
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5769
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
6251
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5770
6252
  cb(KQ_mask, "KQ_mask", -1);
5771
6253
 
5772
6254
  // shift the entire K-cache if needed
5773
6255
  if (do_rope_shift) {
5774
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6256
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
5775
6257
  }
5776
6258
 
5777
6259
  for (int il = 0; il < n_layer; ++il) {
@@ -5808,11 +6290,9 @@ struct llm_build_context {
5808
6290
  ext_factor, attn_factor, beta_fast, beta_slow);
5809
6291
  cb(Kcur, "Kcur", il);
5810
6292
 
5811
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5812
-
5813
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
6293
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5814
6294
  model.layers[il].wo, NULL,
5815
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6295
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5816
6296
  cb(cur, "kqv_out", il);
5817
6297
  }
5818
6298
  struct ggml_tensor * sa_out = cur;
@@ -5867,15 +6347,15 @@ struct llm_build_context {
5867
6347
  struct ggml_tensor * pos;
5868
6348
  struct ggml_tensor * inpL;
5869
6349
 
5870
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
6350
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5871
6351
  cb(inpL, "inp_embd", -1);
5872
6352
 
5873
6353
  // inp_pos - contains the positions
5874
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
6354
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5875
6355
  cb(inp_pos, "inp_pos", -1);
5876
6356
 
5877
6357
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5878
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
6358
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5879
6359
  cb(KQ_mask, "KQ_mask", -1);
5880
6360
 
5881
6361
  pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
@@ -5903,51 +6383,396 @@ struct llm_build_context {
5903
6383
  struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5904
6384
  struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5905
6385
 
5906
- cb(Qcur, "Qcur", il);
6386
+ cb(Qcur, "Qcur", il);
6387
+ cb(Kcur, "Kcur", il);
6388
+ cb(Vcur, "Vcur", il);
6389
+
6390
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6391
+
6392
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6393
+ model.layers[il].wo, model.layers[il].bo,
6394
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6395
+ cb(cur, "kqv_out", il);
6396
+ }
6397
+
6398
+ // add the input
6399
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
6400
+ cb(ffn_inp, "ffn_inp", il);
6401
+
6402
+ // FF
6403
+ {
6404
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6405
+ model.layers[il].ffn_norm,
6406
+ model.layers[il].ffn_norm_b,
6407
+ LLM_NORM, cb, il);
6408
+ cb(cur, "ffn_norm", il);
6409
+
6410
+ cur = llm_build_ffn(ctx0, cur,
6411
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
6412
+ NULL, NULL,
6413
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
6414
+ NULL,
6415
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
6416
+ cb(cur, "ffn_out", il);
6417
+ }
6418
+
6419
+ inpL = ggml_add(ctx0, cur, ffn_inp);
6420
+ cb(inpL, "l_out", il);
6421
+ }
6422
+
6423
+ cur = llm_build_norm(ctx0, inpL, hparams,
6424
+ model.output_norm,
6425
+ model.output_norm_b,
6426
+ LLM_NORM, cb, -1);
6427
+ cb(cur, "result_norm", -1);
6428
+
6429
+ cur = ggml_mul_mat(ctx0, model.output, cur);
6430
+ cb(cur, "result_output", -1);
6431
+
6432
+ ggml_build_forward_expand(gf, cur);
6433
+
6434
+ return gf;
6435
+ }
6436
+
6437
+ struct ggml_cgraph * build_codeshell() {
6438
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6439
+
6440
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6441
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6442
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6443
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6444
+
6445
+ struct ggml_tensor * cur;
6446
+ struct ggml_tensor * inpL;
6447
+
6448
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6449
+ cb(inpL, "inp_embd", -1);
6450
+
6451
+ // inp_pos - contains the positions
6452
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6453
+ cb(inp_pos, "inp_pos", -1);
6454
+
6455
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6456
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6457
+ cb(KQ_mask, "KQ_mask", -1);
6458
+
6459
+ // shift the entire K-cache if needed
6460
+ if (do_rope_shift) {
6461
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6462
+ }
6463
+
6464
+ for (int il = 0; il < n_layer; ++il) {
6465
+ cur = llm_build_norm(ctx0, inpL, hparams,
6466
+ model.layers[il].attn_norm,
6467
+ model.layers[il].attn_norm_b,
6468
+ LLM_NORM, cb, il);
6469
+ cb(cur, "attn_norm", il);
6470
+
6471
+ // self-attention
6472
+ {
6473
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
6474
+ cb(cur, "wqkv", il);
6475
+
6476
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6477
+ cb(cur, "bqkv", il);
6478
+
6479
+ struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6480
+ struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6481
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6482
+
6483
+ cb(tmpq, "tmpq", il);
6484
+ cb(tmpk, "tmpk", il);
6485
+ cb(Vcur, "Vcur", il);
6486
+
6487
+ struct ggml_tensor * Qcur = ggml_rope_custom(
6488
+ ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
6489
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6490
+ ext_factor, attn_factor, beta_fast, beta_slow
6491
+ );
6492
+ cb(Qcur, "Qcur", il);
6493
+
6494
+ struct ggml_tensor * Kcur = ggml_rope_custom(
6495
+ ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
6496
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6497
+ ext_factor, attn_factor, beta_fast, beta_slow
6498
+ );
6499
+ cb(Kcur, "Kcur", il);
6500
+
6501
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6502
+ model.layers[il].wo, model.layers[il].bo,
6503
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6504
+ cb(cur, "kqv_out", il);
6505
+ }
6506
+
6507
+ // add the input
6508
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
6509
+ cb(ffn_inp, "ffn_inp", il);
6510
+
6511
+ // FF
6512
+ {
6513
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6514
+ model.layers[il].ffn_norm,
6515
+ model.layers[il].ffn_norm_b,
6516
+ LLM_NORM, cb, il);
6517
+ cb(cur, "ffn_norm", il);
6518
+
6519
+ cur = llm_build_ffn(ctx0, cur,
6520
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
6521
+ NULL, NULL,
6522
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
6523
+ NULL,
6524
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
6525
+ cb(cur, "ffn_out", il);
6526
+ }
6527
+
6528
+ inpL = ggml_add(ctx0, cur, ffn_inp);
6529
+ cb(inpL, "l_out", il);
6530
+ }
6531
+
6532
+ cur = llm_build_norm(ctx0, inpL, hparams,
6533
+ model.output_norm,
6534
+ model.output_norm_b,
6535
+ LLM_NORM, cb, -1);
6536
+ cb(cur, "result_norm", -1);
6537
+
6538
+ cur = ggml_mul_mat(ctx0, model.output, cur);
6539
+ cb(cur, "result_output", -1);
6540
+
6541
+ ggml_build_forward_expand(gf, cur);
6542
+
6543
+ return gf;
6544
+ }
6545
+
6546
+ struct ggml_cgraph * build_orion() {
6547
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6548
+
6549
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6550
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6551
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6552
+
6553
+ struct ggml_tensor * cur;
6554
+ struct ggml_tensor * inpL;
6555
+
6556
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6557
+ cb(inpL, "inp_embd", -1);
6558
+
6559
+ // inp_pos - contains the positions
6560
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6561
+ cb(inp_pos, "inp_pos", -1);
6562
+
6563
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6564
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6565
+ cb(KQ_mask, "KQ_mask", -1);
6566
+
6567
+ // shift the entire K-cache if needed
6568
+ if (do_rope_shift) {
6569
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6570
+ }
6571
+
6572
+ for (int il = 0; il < n_layer; ++il) {
6573
+ struct ggml_tensor * inpSA = inpL;
6574
+
6575
+ // norm
6576
+ cur = llm_build_norm(ctx0, inpL, hparams,
6577
+ model.layers[il].attn_norm, model.layers[il].attn_norm_b,
6578
+ LLM_NORM, cb, il);
6579
+ cb(cur, "attn_norm", il);
6580
+
6581
+ // self-attention
6582
+ {
6583
+ // compute Q and K and RoPE them
6584
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6585
+ cb(Qcur, "Qcur", il);
6586
+ // if (model.layers[il].bq) {
6587
+ // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
6588
+ // cb(Qcur, "Qcur", il);
6589
+ // }
6590
+
6591
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
6592
+ cb(Kcur, "Kcur", il);
6593
+ // if (model.layers[il].bk) {
6594
+ // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
6595
+ // cb(Kcur, "Kcur", il);
6596
+ // }
6597
+
6598
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
6599
+ cb(Vcur, "Vcur", il);
6600
+ // if (model.layers[il].bv) {
6601
+ // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
6602
+ // cb(Vcur, "Vcur", il);
6603
+ // }
6604
+
6605
+ Qcur = ggml_rope_custom(
6606
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6607
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6608
+ ext_factor, attn_factor, beta_fast, beta_slow
6609
+ );
6610
+ cb(Qcur, "Qcur", il);
6611
+
6612
+ Kcur = ggml_rope_custom(
6613
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6614
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6615
+ ext_factor, attn_factor, beta_fast, beta_slow
6616
+ );
6617
+ cb(Kcur, "Kcur", il);
6618
+
6619
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6620
+ model.layers[il].wo, NULL,
6621
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6622
+ cb(cur, "kqv_out", il);
6623
+ }
6624
+
6625
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6626
+ cb(ffn_inp, "ffn_inp", il);
6627
+
6628
+ // feed-forward network
6629
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6630
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
6631
+ LLM_NORM, cb, il);
6632
+ cb(cur, "ffn_norm", il);
6633
+
6634
+ cur = llm_build_ffn(ctx0, cur,
6635
+ model.layers[il].ffn_up, NULL,
6636
+ model.layers[il].ffn_gate, NULL,
6637
+ model.layers[il].ffn_down, NULL,
6638
+ NULL,
6639
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
6640
+ cb(cur, "ffn_out", il);
6641
+
6642
+ cur = ggml_add(ctx0, cur, ffn_inp);
6643
+ cb(cur, "l_out", il);
6644
+
6645
+ // input for next layer
6646
+ inpL = cur;
6647
+ }
6648
+
6649
+ cur = inpL;
6650
+
6651
+ cur = llm_build_norm(ctx0, cur, hparams,
6652
+ model.output_norm, model.output_norm_b,
6653
+ LLM_NORM, cb, -1);
6654
+ cb(cur, "result_norm", -1);
6655
+
6656
+ // lm_head
6657
+ cur = ggml_mul_mat(ctx0, model.output, cur);
6658
+ cb(cur, "result_output", -1);
6659
+
6660
+ ggml_build_forward_expand(gf, cur);
6661
+
6662
+ return gf;
6663
+ }
6664
+
6665
+ struct ggml_cgraph * build_internlm2() {
6666
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6667
+
6668
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6669
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6670
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6671
+
6672
+ struct ggml_tensor * cur;
6673
+ struct ggml_tensor * inpL;
6674
+
6675
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6676
+ cb(inpL, "inp_embd", -1);
6677
+
6678
+ // inp_pos - contains the positions
6679
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6680
+ cb(inp_pos, "inp_pos", -1);
6681
+
6682
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6683
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6684
+ cb(KQ_mask, "KQ_mask", -1);
6685
+
6686
+ // shift the entire K-cache if needed
6687
+ if (do_rope_shift) {
6688
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6689
+ }
6690
+
6691
+ for (int il = 0; il < n_layer; ++il) {
6692
+ struct ggml_tensor * inpSA = inpL;
6693
+
6694
+ // norm
6695
+ cur = llm_build_norm(ctx0, inpL, hparams,
6696
+ model.layers[il].attn_norm, NULL,
6697
+ LLM_NORM_RMS, cb, il);
6698
+ cb(cur, "attn_norm", il);
6699
+
6700
+ // self-attention
6701
+ {
6702
+ // compute Q and K and RoPE them
6703
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6704
+ cb(Qcur, "Qcur", il);
6705
+ if (model.layers[il].bq) {
6706
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
6707
+ cb(Qcur, "Qcur", il);
6708
+ }
6709
+
6710
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
5907
6711
  cb(Kcur, "Kcur", il);
6712
+ if (model.layers[il].bk) {
6713
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
6714
+ cb(Kcur, "Kcur", il);
6715
+ }
6716
+
6717
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
5908
6718
  cb(Vcur, "Vcur", il);
6719
+ if (model.layers[il].bv) {
6720
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
6721
+ cb(Vcur, "Vcur", il);
6722
+ }
5909
6723
 
5910
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6724
+ Qcur = ggml_rope_custom(
6725
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6726
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
6727
+ ext_factor, attn_factor, beta_fast, beta_slow
6728
+ );
6729
+ cb(Qcur, "Qcur", il);
5911
6730
 
5912
- llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
6731
+ Kcur = ggml_rope_custom(
6732
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6733
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
6734
+ ext_factor, attn_factor, beta_fast, beta_slow
6735
+ );
6736
+ cb(Kcur, "Kcur", il);
5913
6737
 
5914
- cur = llm_build_kqv(ctx0, model, hparams, kv_self,
6738
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5915
6739
  model.layers[il].wo, model.layers[il].bo,
5916
- Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6740
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5917
6741
  cb(cur, "kqv_out", il);
5918
6742
  }
5919
6743
 
5920
- // add the input
5921
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
6744
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
5922
6745
  cb(ffn_inp, "ffn_inp", il);
5923
6746
 
5924
- // FF
5925
- {
5926
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
5927
- model.layers[il].ffn_norm,
5928
- model.layers[il].ffn_norm_b,
5929
- LLM_NORM, cb, il);
5930
- cb(cur, "ffn_norm", il);
6747
+ // feed-forward network
6748
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6749
+ model.layers[il].ffn_norm, NULL,
6750
+ LLM_NORM_RMS, cb, il);
6751
+ cb(cur, "ffn_norm", il);
5931
6752
 
5932
- cur = llm_build_ffn(ctx0, cur,
5933
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
5934
- NULL, NULL,
5935
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
5936
- NULL,
5937
- LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
5938
- cb(cur, "ffn_out", il);
5939
- }
6753
+ cur = llm_build_ffn(ctx0, cur,
6754
+ model.layers[il].ffn_up, NULL,
6755
+ model.layers[il].ffn_gate, NULL,
6756
+ model.layers[il].ffn_down, NULL,
6757
+ NULL,
6758
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
6759
+ cb(cur, "ffn_out", il);
5940
6760
 
5941
- inpL = ggml_add(ctx0, cur, ffn_inp);
5942
- cb(inpL, "l_out", il);
6761
+ cur = ggml_add(ctx0, cur, ffn_inp);
6762
+ cb(cur, "l_out", il);
6763
+
6764
+ // input for next layer
6765
+ inpL = cur;
5943
6766
  }
5944
6767
 
5945
- cur = llm_build_norm(ctx0, inpL, hparams,
5946
- model.output_norm,
5947
- model.output_norm_b,
5948
- LLM_NORM, cb, -1);
6768
+ cur = inpL;
6769
+
6770
+ cur = llm_build_norm(ctx0, cur, hparams,
6771
+ model.output_norm, NULL,
6772
+ LLM_NORM_RMS, cb, -1);
5949
6773
  cb(cur, "result_norm", -1);
5950
6774
 
6775
+ // lm_head
5951
6776
  cur = ggml_mul_mat(ctx0, model.output, cur);
5952
6777
  cb(cur, "result_output", -1);
5953
6778
 
@@ -5955,6 +6780,7 @@ struct llm_build_context {
5955
6780
 
5956
6781
  return gf;
5957
6782
  }
6783
+
5958
6784
  };
5959
6785
 
5960
6786
  static struct ggml_cgraph * llama_build_graph(
@@ -5965,15 +6791,7 @@ static struct ggml_cgraph * llama_build_graph(
5965
6791
  // check if we should build the worst-case graph (for memory measurement)
5966
6792
  const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
5967
6793
 
5968
- // keep track of the input that has already been allocated
5969
- bool alloc_inp_tokens = false;
5970
- bool alloc_inp_embd = false;
5971
- bool alloc_inp_pos = false;
5972
- bool alloc_inp_KQ_mask = false;
5973
- bool alloc_inp_K_shift = false;
5974
-
5975
6794
  // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
5976
- // TODO: improve handling of input and output tensors, then replace this with ggml_set_name
5977
6795
  llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
5978
6796
  if (il >= 0) {
5979
6797
  ggml_format_name(cur, "%s-%d", name, il);
@@ -5981,118 +6799,78 @@ static struct ggml_cgraph * llama_build_graph(
5981
6799
  ggml_set_name(cur, name);
5982
6800
  }
5983
6801
 
5984
- //
5985
- // allocate input tensors and set input data
5986
- //
5987
-
5988
- if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) {
5989
- ggml_tallocr_alloc(lctx.alloc, cur);
5990
-
5991
- if (!ggml_tallocr_is_measure(lctx.alloc) && batch.token) {
5992
- const int64_t n_tokens = cur->ne[0];
5993
-
5994
- ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
6802
+ if (!lctx.cparams.offload_kqv) {
6803
+ if (strcmp(name, "kqv_merged_cont") == 0) {
6804
+ // all nodes between the KV store and the attention output are run on the CPU
6805
+ ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
5995
6806
  }
5996
-
5997
- alloc_inp_tokens = true;
5998
6807
  }
6808
+ };
5999
6809
 
6000
- if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) {
6001
- ggml_tallocr_alloc(lctx.alloc, cur);
6810
+ struct ggml_cgraph * result = NULL;
6002
6811
 
6003
- if (!ggml_tallocr_is_measure(lctx.alloc) && batch.embd) {
6004
- const int64_t n_embd = cur->ne[0];
6005
- const int64_t n_tokens = cur->ne[1];
6812
+ struct llm_build_context llm(lctx, batch, cb, worst_case);
6006
6813
 
6007
- ggml_backend_tensor_set(cur, batch.embd, 0, n_tokens*n_embd*ggml_element_size(cur));
6008
- }
6814
+ //
6815
+ // set input data
6816
+ //
6817
+
6818
+ if (!ggml_tallocr_is_measure(lctx.alloc)) {
6819
+ if (batch.token) {
6820
+ const int64_t n_tokens = batch.n_tokens;
6009
6821
 
6010
- alloc_inp_embd = true;
6822
+ ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
6011
6823
  }
6012
6824
 
6013
- if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) {
6014
- ggml_tallocr_alloc(lctx.alloc, cur);
6825
+ if (batch.embd) {
6826
+ const int64_t n_embd = llm.n_embd;
6827
+ const int64_t n_tokens = batch.n_tokens;
6015
6828
 
6016
- if (!ggml_tallocr_is_measure(lctx.alloc) && batch.pos) {
6017
- const int64_t n_tokens = cur->ne[0];
6829
+ ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
6830
+ }
6018
6831
 
6019
- static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t");
6020
- ggml_backend_tensor_set(cur, batch.pos, 0, n_tokens*ggml_element_size(cur));
6021
- }
6832
+ if (batch.pos) {
6833
+ const int64_t n_tokens = batch.n_tokens;
6022
6834
 
6023
- alloc_inp_pos = true;
6835
+ ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
6024
6836
  }
6025
6837
 
6026
- if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
6027
- ggml_tallocr_alloc(lctx.alloc, cur);
6838
+ {
6839
+ const int64_t n_kv = llm.n_kv;
6840
+ const int64_t n_tokens = batch.n_tokens;
6028
6841
 
6029
- if (!ggml_tallocr_is_measure(lctx.alloc)) {
6030
- const int64_t n_kv = cur->ne[0];
6031
- const int64_t n_tokens = cur->ne[1];
6842
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
6843
+ float * data = (float *) lctx.inp_KQ_mask->data;
6032
6844
 
6033
- float * data;
6034
- if (ggml_backend_buffer_is_host(cur->buffer)) {
6035
- data = (float *) cur->data;
6036
- } else {
6037
- lctx.buf_copy.resize(ggml_nbytes(cur));
6038
- data = (float *) lctx.buf_copy.data();
6039
- }
6845
+ for (int h = 0; h < 1; ++h) {
6846
+ for (int j = 0; j < n_tokens; ++j) {
6847
+ const llama_pos pos = batch.pos[j];
6848
+ const llama_seq_id seq_id = batch.seq_id[j][0];
6040
6849
 
6041
- for (int h = 0; h < 1; ++h) {
6042
- for (int j = 0; j < n_tokens; ++j) {
6043
- const llama_pos pos = batch.pos[j];
6044
- const llama_seq_id seq_id = batch.seq_id[j][0];
6045
-
6046
- for (int i = 0; i < n_kv; ++i) {
6047
- float f;
6048
- if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
6049
- f = -INFINITY;
6050
- } else {
6051
- f = 0;
6052
- }
6053
- data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
6850
+ for (int i = 0; i < n_kv; ++i) {
6851
+ float f;
6852
+ if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
6853
+ f = -INFINITY;
6854
+ } else {
6855
+ f = 0;
6054
6856
  }
6857
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
6055
6858
  }
6056
6859
  }
6057
-
6058
- if (data != cur->data) {
6059
- ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
6060
- }
6061
6860
  }
6062
-
6063
- alloc_inp_KQ_mask = true;
6064
6861
  }
6065
6862
 
6066
- if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) {
6067
- ggml_tallocr_alloc(lctx.alloc, cur);
6068
-
6069
- if (!ggml_tallocr_is_measure(lctx.alloc)) {
6070
- const int64_t n_ctx = cur->ne[0];
6071
-
6072
- int32_t * data;
6073
- if (ggml_backend_buffer_is_host(cur->buffer)) {
6074
- data = (int32_t *) cur->data;
6075
- } else {
6076
- lctx.buf_copy.resize(ggml_nbytes(cur));
6077
- data = (int32_t *) lctx.buf_copy.data();
6078
- }
6863
+ if (llm.do_rope_shift) {
6864
+ const int64_t n_ctx = llm.n_ctx;
6079
6865
 
6080
- for (int i = 0; i < n_ctx; ++i) {
6081
- data[i] = lctx.kv_self.cells[i].delta;
6082
- }
6866
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
6867
+ int32_t * data = (int32_t *) lctx.inp_K_shift->data;
6083
6868
 
6084
- if (data != cur->data) {
6085
- ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
6086
- }
6869
+ for (int i = 0; i < n_ctx; ++i) {
6870
+ data[i] = lctx.kv_self.cells[i].delta;
6087
6871
  }
6088
-
6089
- alloc_inp_K_shift = true;
6090
6872
  }
6091
- };
6092
-
6093
- struct ggml_cgraph * result = NULL;
6094
-
6095
- struct llm_build_context llm(lctx, batch, cb, worst_case);
6873
+ }
6096
6874
 
6097
6875
  llm.init();
6098
6876
 
@@ -6137,6 +6915,10 @@ static struct ggml_cgraph * llama_build_graph(
6137
6915
  {
6138
6916
  result = llm.build_qwen();
6139
6917
  } break;
6918
+ case LLM_ARCH_QWEN2:
6919
+ {
6920
+ result = llm.build_qwen2();
6921
+ } break;
6140
6922
  case LLM_ARCH_PHI2:
6141
6923
  {
6142
6924
  result = llm.build_phi2();
@@ -6149,6 +6931,18 @@ static struct ggml_cgraph * llama_build_graph(
6149
6931
  {
6150
6932
  result = llm.build_gpt2();
6151
6933
  } break;
6934
+ case LLM_ARCH_CODESHELL:
6935
+ {
6936
+ result = llm.build_codeshell();
6937
+ } break;
6938
+ case LLM_ARCH_ORION:
6939
+ {
6940
+ result = llm.build_orion();
6941
+ } break;
6942
+ case LLM_ARCH_INTERNLM2:
6943
+ {
6944
+ result = llm.build_internlm2();
6945
+ } break;
6152
6946
  default:
6153
6947
  GGML_ASSERT(false);
6154
6948
  }
@@ -6254,6 +7048,7 @@ static int llama_decode_internal(
6254
7048
  //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
6255
7049
 
6256
7050
  ggml_backend_sched_reset(lctx.sched);
7051
+ ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
6257
7052
 
6258
7053
  ggml_cgraph * gf = llama_build_graph(lctx, batch);
6259
7054
 
@@ -6279,11 +7074,6 @@ static int llama_decode_internal(
6279
7074
  n_threads = std::min(4, n_threads);
6280
7075
  }
6281
7076
 
6282
- const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
6283
- if (ggml_cpu_has_cublas() && fully_offloaded) {
6284
- n_threads = 1;
6285
- }
6286
-
6287
7077
  #ifdef GGML_USE_MPI
6288
7078
  const int64_t n_layer = hparams.n_layer;
6289
7079
  ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
@@ -7095,7 +7885,9 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
7095
7885
  //
7096
7886
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
7097
7887
  if (&fragment == &fragment_buffer.front()) {
7098
- raw_text = " " + raw_text; // prefix with space if the first token is not special
7888
+ if (vocab.add_space_prefix) {
7889
+ raw_text = " " + raw_text; // prefix with space if the first token is not special
7890
+ }
7099
7891
  }
7100
7892
 
7101
7893
  #ifdef PRETOKENIZERDEBUG
@@ -7574,6 +8366,11 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
7574
8366
  }
7575
8367
 
7576
8368
  void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
8369
+ // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
8370
+ // if (k >= (int32_t)candidates->size) {
8371
+ // return;
8372
+ // }
8373
+
7577
8374
  const int64_t t_start_sample_us = ggml_time_us();
7578
8375
 
7579
8376
  k = std::max(k, (int) min_keep);
@@ -7584,10 +8381,57 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can
7584
8381
  auto comp = [](const llama_token_data & a, const llama_token_data & b) {
7585
8382
  return a.logit > b.logit;
7586
8383
  };
7587
- if (k == (int) candidates->size) {
7588
- std::sort(candidates->data, candidates->data + candidates->size, comp);
7589
- } else {
8384
+ if (k <= 128) {
7590
8385
  std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
8386
+ } else {
8387
+ constexpr int nbuckets = 128;
8388
+ constexpr float bucket_low = -10.0f;
8389
+ constexpr float bucket_high = 10.0f;
8390
+ constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
8391
+ constexpr float bucket_inter = -bucket_low * bucket_scale;
8392
+
8393
+ std::vector<int> bucket_idx(candidates->size);
8394
+ std::vector<int> histo(nbuckets, 0);
8395
+
8396
+ for (int i = 0; i < (int)candidates->size; ++i) {
8397
+ const float val = candidates->data[i].logit;
8398
+ int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
8399
+ ib = std::max(0, std::min(nbuckets-1, ib));
8400
+ bucket_idx[i] = ib;
8401
+ ++histo[ib];
8402
+ }
8403
+ int nhave = 0;
8404
+ int ib = nbuckets - 1;
8405
+ for ( ; ib >= 0; --ib) {
8406
+ nhave += histo[ib];
8407
+ if (nhave >= k) break;
8408
+ }
8409
+ std::vector<llama_token_data> tmp_tokens(nhave);
8410
+ auto ptr = tmp_tokens.data();
8411
+ std::vector<llama_token_data*> bucket_ptrs;
8412
+ bucket_ptrs.reserve(nbuckets - ib);
8413
+ for (int j = nbuckets - 1; j >= ib; --j) {
8414
+ bucket_ptrs.push_back(ptr);
8415
+ ptr += histo[j];
8416
+ }
8417
+ for (int i = 0; i < (int)candidates->size; ++i) {
8418
+ int j = bucket_idx[i];
8419
+ if (j >= ib) {
8420
+ *bucket_ptrs[nbuckets-1-j]++ = candidates->data[i];
8421
+ }
8422
+ }
8423
+
8424
+ ptr = tmp_tokens.data();
8425
+ int ndone = 0;
8426
+ for (int j = nbuckets-1; j > ib; --j) {
8427
+ std::sort(ptr, ptr + histo[j], comp);
8428
+ ptr += histo[j];
8429
+ ndone += histo[j];
8430
+ }
8431
+ std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
8432
+
8433
+ std::memcpy(candidates->data, tmp_tokens.data(), k*sizeof(llama_token_data));
8434
+
7591
8435
  }
7592
8436
  candidates->sorted = true;
7593
8437
  }
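The new k > 128 path above replaces sorting the whole candidate array with a histogram ("bucket") pass: logits are binned into 128 buckets over [-10, 10], only the buckets that can still contain the top k entries are copied out and sorted, and the result is memcpy'd back in front. A self-contained sketch of the same pruning idea on plain floats follows; the bucket count and range are taken from the code above, but the sketch sorts the surviving values in one go rather than bucket by bucket, so it is a simplification, not the exact routine:

#include <algorithm>
#include <functional>
#include <vector>

// Return the k largest values in descending order: count values per bucket,
// find the lowest bucket still needed to reach k, and only sort those values.
std::vector<float> top_k_bucketed(const std::vector<float> & vals, int k) {
    constexpr int   nbuckets    = 128;
    constexpr float bucket_low  = -10.0f;
    constexpr float bucket_high =  10.0f;
    constexpr float scale       = nbuckets/(bucket_high - bucket_low);

    std::vector<int> histo(nbuckets, 0);
    std::vector<int> idx(vals.size());
    for (size_t i = 0; i < vals.size(); ++i) {
        int ib = std::max(0, std::min(nbuckets - 1, int(scale*(vals[i] - bucket_low))));
        idx[i] = ib;
        ++histo[ib];
    }

    // walk buckets from the top until they hold at least k values
    int nhave = 0, ib = nbuckets - 1;
    for (; ib >= 0; --ib) { nhave += histo[ib]; if (nhave >= k) break; }

    std::vector<float> picked;
    picked.reserve(nhave);
    for (size_t i = 0; i < vals.size(); ++i) {
        if (idx[i] >= ib) picked.push_back(vals[i]);
    }
    std::sort(picked.begin(), picked.end(), std::greater<float>());
    if ((int) picked.size() > k) picked.resize(k); // keep only the top k
    return picked;
}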
@@ -7635,21 +8479,56 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
7635
8479
  return;
7636
8480
  }
7637
8481
 
7638
- llama_sample_softmax(ctx, candidates);
7639
-
7640
8482
  const int64_t t_start_sample_us = ggml_time_us();
7641
8483
 
7642
- float scale = candidates->data[0].p; // scale by max prob
7643
- size_t i = 1; // first token always matches
8484
+ bool min_p_applied = false;
8485
+
8486
+ // if the candidates aren't sorted, try the unsorted implementation first
8487
+ if (!candidates->sorted) {
8488
+ std::vector<llama_token_data> filtered_tokens;
8489
+
8490
+ float max_logit = -FLT_MAX;
8491
+ for (size_t i = 0; i < candidates->size; ++i) {
8492
+ max_logit = std::max(max_logit, candidates->data[i].logit);
8493
+ }
8494
+ const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
8495
+
8496
+ for (size_t i = 0; i < candidates->size; ++i) {
8497
+ if (candidates->data[i].logit >= min_logit) {
8498
+ filtered_tokens.push_back(candidates->data[i]);
8499
+ }
8500
+ }
7644
8501
 
7645
- for (; i < candidates->size; ++i) {
7646
- if (candidates->data[i].p < p * scale && i >= min_keep) {
7647
- break; // prob too small
8502
+ // if we have enough values the operation was a success
8503
+ if (filtered_tokens.size() >= min_keep) {
8504
+ memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
8505
+ candidates->size = filtered_tokens.size();
8506
+ min_p_applied = true;
7648
8507
  }
7649
8508
  }
7650
8509
 
7651
- // Resize the output vector to keep only the matching tokens
7652
- candidates->size = i;
8510
+ // if the candidates are sorted or the unsorted implementation failed, use this implementation
8511
+ if (!min_p_applied) {
8512
+ // Sort the logits in descending order
8513
+ if (!candidates->sorted) {
8514
+ std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
8515
+ return a.logit > b.logit;
8516
+ });
8517
+ candidates->sorted = true;
8518
+ }
8519
+
8520
+ const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
8521
+ size_t i = 1; // first token always matches
8522
+
8523
+ for (; i < candidates->size; ++i) {
8524
+ if (candidates->data[i].logit < min_logit && i >= min_keep) {
8525
+ break; // prob too small
8526
+ }
8527
+ }
8528
+
8529
+ // Resize the output vector to keep only the matching tokens
8530
+ candidates->size = i;
8531
+ }
7653
8532
 
7654
8533
  if (ctx) {
7655
8534
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
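The rewritten llama_sample_min_p above can skip the softmax entirely because the min-p condition on probabilities is equivalent to a pure logit threshold, which is the `max_logit + logf(p)` (resp. `candidates->data[0].logit + logf(p)`) used in both branches. With logits \ell_i and \ell_{\max} the largest logit:

p_i \ge p \cdot p_{\max}
\;\Longleftrightarrow\;
\frac{e^{\ell_i}}{\sum_j e^{\ell_j}} \ge p \cdot \frac{e^{\ell_{\max}}}{\sum_j e^{\ell_j}}
\;\Longleftrightarrow\;
e^{\ell_i} \ge p \, e^{\ell_{\max}}
\;\Longleftrightarrow\;
\ell_i \ge \ell_{\max} + \ln p .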
@@ -7779,6 +8658,73 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
7779
8658
  }
7780
8659
  }
7781
8660
 
8661
+ void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
8662
+ const int64_t t_start_sample_us = ggml_time_us();
8663
+
8664
+ // no need to do anything if there is only one (or zero) candidates
8665
+ if(candidates_p->size <= 1) {
8666
+ return;
8667
+ }
8668
+
8669
+ // Calculate maximum possible entropy
8670
+ float max_entropy = -logf(1.0f / candidates_p->size);
8671
+
8672
+ llama_sample_softmax(nullptr, candidates_p);
8673
+
8674
+ // Calculate entropy of the softmax probabilities
8675
+ float entropy = 0.0f;
8676
+ for (size_t i = 0; i < candidates_p->size; ++i) {
8677
+ float prob = candidates_p->data[i].p;
8678
+ if (prob > 0.0f) { // Ensure no log(0)
8679
+ entropy -= prob * logf(prob);
8680
+ }
8681
+ }
8682
+
8683
+ // Normalize the entropy (max_entropy cannot be 0 here because we checked candidates_p->size != 1 above)
8684
+ float normalized_entropy = entropy / max_entropy;
8685
+
8686
+ // Map the normalized entropy to the desired temperature range using the power function
8687
+ float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
8688
+
8689
+ #ifdef DEBUG
8690
+ LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
8691
+ LLAMA_LOG_INFO("Entropy: %f\n", entropy);
8692
+ LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
8693
+ LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
8694
+ LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
8695
+ LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
8696
+ #endif
8697
+
8698
+ // Apply the dynamically calculated temperature scaling
8699
+ for (size_t i = 0; i < candidates_p->size; ++i) {
8700
+ candidates_p->data[i].logit /= dyn_temp;
8701
+ }
8702
+
8703
+ // Re-compute softmax probabilities after scaling logits with dynamic temperature
8704
+ double max_l_double = candidates_p->data[0].logit;
8705
+ double cum_sum_double = 0.0;
8706
+ for (size_t i = 0; i < candidates_p->size; ++i) {
8707
+ double p = exp(candidates_p->data[i].logit - max_l_double);
8708
+ candidates_p->data[i].p = p; // Store the scaled probability
8709
+ cum_sum_double += p;
8710
+ }
8711
+ for (size_t i = 0; i < candidates_p->size; ++i) {
8712
+ candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
8713
+ }
8714
+
8715
+ #ifdef DEBUG
8716
+ // Print the updated top 25 probabilities after temperature scaling
8717
+ LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
8718
+ for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) {
8719
+ LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f);
8720
+ }
8721
+ #endif
8722
+
8723
+ if (ctx) {
8724
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
8725
+ }
8726
+ }
8727
+
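As a quick sanity check of the new llama_sample_entropy: the entropy of the softmaxed candidates is normalized by the maximum possible entropy ln(n) and then mapped into [min_temp, max_temp] with a power curve, so peaked distributions get a temperature near min_temp and flat ones near max_temp. A small standalone sketch of just that mapping, with toy probabilities and illustrative parameter values (not taken from the library):

#include <cmath>
#include <cstdio>
#include <vector>

// dyn_temp = min_temp + (max_temp - min_temp) * (H / H_max)^exponent
int main() {
    const std::vector<float> p = {0.7f, 0.2f, 0.05f, 0.05f}; // already softmaxed
    const float min_temp = 0.3f, max_temp = 2.0f, exponent = 1.0f;

    float entropy = 0.0f;
    for (float pi : p) {
        if (pi > 0.0f) entropy -= pi * std::log(pi);
    }
    const float max_entropy = std::log((float) p.size()); // = -log(1/n)
    const float dyn_temp = min_temp + (max_temp - min_temp)
                         * std::pow(entropy / max_entropy, exponent);

    std::printf("H = %.3f, H_max = %.3f, dyn_temp = %.3f\n", entropy, max_entropy, dyn_temp);
    return 0;
}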
7782
8728
  void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
7783
8729
  const int64_t t_start_sample_us = ggml_time_us();
7784
8730
 
@@ -8367,9 +9313,13 @@ struct quantize_state_internal {
8367
9313
  const llama_model_quantize_params * params;
8368
9314
 
8369
9315
  int n_attention_wv = 0;
8370
- int n_feed_forward_w2 = 0;
9316
+ int n_ffn_down = 0;
9317
+ int n_ffn_gate = 0;
9318
+ int n_ffn_up = 0;
8371
9319
  int i_attention_wv = 0;
8372
- int i_feed_forward_w2 = 0;
9320
+ int i_ffn_down = 0;
9321
+ int i_ffn_gate = 0;
9322
+ int i_ffn_up = 0;
8373
9323
 
8374
9324
  int n_k_quantized = 0;
8375
9325
  int n_fallback = 0;
@@ -8453,6 +9403,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8453
9403
  auto use_more_bits = [](int i_layer, int num_layers) -> bool {
8454
9404
  return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
8455
9405
  };
9406
+ const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
9407
+ auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
9408
+ if (n_expert > 1) {
9409
+ // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
9410
+ // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
9411
+ // for getting the current layer as I initially thought, and we need to resort to parsing the
9412
+ // tensor name.
9413
+ n_layer /= n_expert;
9414
+ if (sscanf(name, "blk.%d.", &i_layer) != 1) {
9415
+ throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
9416
+ }
9417
+ if (i_layer < 0 || i_layer >= n_layer) {
9418
+ throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
9419
+ }
9420
+ }
9421
+ return std::make_pair(i_layer, n_layer);
9422
+ };
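Two small helpers drive the per-layer choices in this function: use_more_bits selects the first eighth of the layers, the last eighth, and every third layer in between for a higher-precision type, and the new layer_info lambda recovers the layer index for expert tensors by parsing the "blk.%d." prefix of the tensor name. A standalone illustration of both; the layer count and tensor name below are made-up examples, not values from the code:

#include <cstdio>

// Same predicate as use_more_bits above.
static bool use_more_bits(int i_layer, int num_layers) {
    return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
}

int main() {
    const int n_layer = 32; // example layer count
    std::printf("layers with extra bits:");
    for (int il = 0; il < n_layer; ++il) {
        if (use_more_bits(il, n_layer)) std::printf(" %d", il);
    }
    std::printf("\n");

    // Layer recovery from a tensor name, as layer_info does for expert tensors.
    int i_layer = -1;
    if (std::sscanf("blk.17.ffn_down.3.weight", "blk.%d.", &i_layer) == 1) {
        std::printf("parsed layer index: %d\n", i_layer);
    }
    return 0;
}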
8456
9423
 
8457
9424
  if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
8458
9425
  int nx = tensor->ne[0];
@@ -8465,6 +9432,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8465
9432
  else if (new_type != GGML_TYPE_Q8_0) {
8466
9433
  new_type = GGML_TYPE_Q6_K;
8467
9434
  }
9435
+ } else if (name == "token_embd.weight") {
9436
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
9437
+ new_type = GGML_TYPE_Q2_K;
9438
+ }
9439
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
9440
+ new_type = GGML_TYPE_Q4_K;
9441
+ }
8468
9442
  } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
8469
9443
  if (name.find("attn_v.weight") != std::string::npos) {
8470
9444
  if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
@@ -8472,12 +9446,19 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8472
9446
  ++qs.i_attention_wv;
8473
9447
  }
8474
9448
  else if (name.find("ffn_down") != std::string::npos) {
8475
- if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K;
8476
- ++qs.i_feed_forward_w2;
9449
+ if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
9450
+ ++qs.i_ffn_down;
8477
9451
  }
8478
- else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
8479
9452
  } else if (name.find("attn_v.weight") != std::string::npos) {
8480
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
9453
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
9454
+ new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
9455
+ }
9456
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
9457
+ new_type = GGML_TYPE_Q4_K;
9458
+ }
9459
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && qs.model.hparams.n_gqa() >= 4) {
9460
+ new_type = GGML_TYPE_Q4_K;
9461
+ }
8481
9462
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
8482
9463
  new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
8483
9464
  }
@@ -8505,29 +9486,19 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8505
9486
  // TODO: explore better strategies
8506
9487
  new_type = GGML_TYPE_Q8_0;
8507
9488
  }
8508
- } else if (name.find("ffn_down") != std::string::npos) {
8509
- const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
8510
- int i_layer, n_layer;
8511
- if (n_expert == 1) {
8512
- i_layer = qs.i_feed_forward_w2;
8513
- n_layer = qs.n_feed_forward_w2;
8514
- } else {
8515
- // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
8516
- // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
8517
- // for getting the current layer as I initially thought, and we need to resort to parsing the
8518
- // tensor name.
8519
- n_layer = qs.n_feed_forward_w2 / n_expert;
8520
- if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
8521
- throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
8522
- }
8523
- if (i_layer < 0 || i_layer >= n_layer) {
8524
- throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer));
8525
- }
9489
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
9490
+ new_type = GGML_TYPE_Q2_K;
8526
9491
  }
9492
+ } else if (name.find("ffn_down") != std::string::npos) {
9493
+ auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
9494
+ int i_layer = info.first, n_layer = info.second;
8527
9495
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
8528
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
9496
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
8529
9497
  if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
8530
9498
  }
9499
+ //else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
9500
+ // if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
9501
+ //}
8531
9502
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
8532
9503
  new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
8533
9504
  : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
@@ -8555,16 +9526,18 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8555
9526
  // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
8556
9527
  new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
8557
9528
  }
8558
- ++qs.i_feed_forward_w2;
9529
+ ++qs.i_ffn_down;
8559
9530
  } else if (name.find("attn_output.weight") != std::string::npos) {
8560
9531
  if (arch != LLM_ARCH_FALCON) {
8561
9532
  if (qs.model.hparams.n_expert == 8) {
8562
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
9533
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
9534
+ ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
8563
9535
  ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
8564
9536
  new_type = GGML_TYPE_Q5_K;
8565
9537
  }
8566
9538
  } else {
8567
9539
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
9540
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K;
8568
9541
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
8569
9542
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
8570
9543
  }
@@ -8577,6 +9550,24 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8577
9550
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
8578
9551
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
8579
9552
  }
9553
+ else if (name.find("ffn_gate") != std::string::npos) {
9554
+ auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
9555
+ int i_layer = info.first, n_layer = info.second;
9556
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
9557
+ new_type = GGML_TYPE_Q2_K;
9558
+ }
9559
+ ++qs.i_ffn_gate;
9560
+ }
9561
+ else if (name.find("ffn_up") != std::string::npos) {
9562
+ auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
9563
+ int i_layer = info.first, n_layer = info.second;
9564
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
9565
+ new_type = GGML_TYPE_Q2_K;
9566
+ }
9567
+ ++qs.i_ffn_up;
9568
+ }
9569
+ // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
9570
+ //}
8580
9571
  // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
8581
9572
  //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
8582
9573
  // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@@ -8589,7 +9580,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8589
9580
  bool convert_incompatible_tensor = false;
8590
9581
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
8591
9582
  new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
8592
- new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS) {
9583
+ new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
9584
+ new_type == GGML_TYPE_IQ3_XXS) {
8593
9585
  int nx = tensor->ne[0];
8594
9586
  int ny = tensor->ne[1];
8595
9587
  if (nx % QK_K != 0) {
@@ -8603,6 +9595,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8603
9595
  switch (new_type) {
8604
9596
  case GGML_TYPE_IQ2_XXS:
8605
9597
  case GGML_TYPE_IQ2_XS:
9598
+ case GGML_TYPE_IQ3_XXS:
8606
9599
  case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
8607
9600
  case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
8608
9601
  case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
@@ -8631,8 +9624,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
8631
9624
  case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
8632
9625
 
8633
9626
  // K-quants
9627
+ case LLAMA_FTYPE_MOSTLY_Q2_K_S:
8634
9628
  case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
8635
- case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
9629
+ case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
8636
9630
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
8637
9631
  case LLAMA_FTYPE_MOSTLY_Q3_K_M:
8638
9632
  case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -8643,6 +9637,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
8643
9637
  case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
8644
9638
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
8645
9639
  case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
9640
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
8646
9641
 
8647
9642
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
8648
9643
  }
@@ -8700,12 +9695,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
8700
9695
  ++qs.n_attention_wv;
8701
9696
  }
8702
9697
  else if (name.find("ffn_down") != std::string::npos) {
8703
- ++qs.n_feed_forward_w2;
9698
+ ++qs.n_ffn_down;
9699
+ }
9700
+ else if (name.find("ffn_gate") != std::string::npos) {
9701
+ ++qs.n_ffn_gate;
9702
+ }
9703
+ else if (name.find("ffn_up") != std::string::npos) {
9704
+ ++qs.n_ffn_up;
8704
9705
  }
8705
9706
  }
8706
- if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
8707
- LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
8708
- __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
9707
+ if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
9708
+ LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
9709
+ __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
8709
9710
  }
8710
9711
 
8711
9712
  size_t total_size_org = 0;
@@ -8738,8 +9739,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
8738
9739
  // placeholder for the meta data
8739
9740
  ::zeros(fout, meta_size);
8740
9741
 
8741
- std::set<ggml_type> used_iq2;
8742
-
8743
9742
  for (int i = 0; i < ml.n_tensors; ++i) {
8744
9743
  struct ggml_tensor * tensor = ml.get_tensor_meta(i);
8745
9744
 
@@ -8792,11 +9791,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
8792
9791
  } else {
8793
9792
  const size_t nelements = ggml_nelements(tensor);
8794
9793
 
8795
- if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS) && used_iq2.find(new_type) == used_iq2.end()) {
8796
- ggml_init_iq2_quantization(new_type);
8797
- used_iq2.insert(new_type);
8798
- }
8799
-
8800
9794
  const float * imatrix = nullptr;
8801
9795
  if (imatrix_data) {
8802
9796
  auto it = imatrix_data->find(tensor->name);
@@ -8922,10 +9916,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
8922
9916
 
8923
9917
  fout.close();
8924
9918
 
8925
- for (auto type : used_iq2) {
8926
- ggml_deinit_iq2_quantization(type);
8927
- }
8928
-
8929
9919
  gguf_free(ctx_out);
8930
9920
 
8931
9921
  LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
@@ -9271,6 +10261,8 @@ struct llama_context_params llama_context_default_params() {
9271
10261
  /*.yarn_beta_fast =*/ 32.0f,
9272
10262
  /*.yarn_beta_slow =*/ 1.0f,
9273
10263
  /*.yarn_orig_ctx =*/ 0,
10264
+ /*.cb_eval =*/ nullptr,
10265
+ /*.cb_eval_user_data =*/ nullptr,
9274
10266
  /*.type_k =*/ GGML_TYPE_F16,
9275
10267
  /*.type_v =*/ GGML_TYPE_F16,
9276
10268
  /*.mul_mat_q =*/ true,
@@ -9296,18 +10288,45 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
  return result;
  }

- int32_t llama_max_devices(void) {
- return LLAMA_MAX_DEVICES;
+ size_t llama_max_devices(void) {
+ #if defined(GGML_USE_METAL)
+ return 1;
+ #elif defined(GGML_USE_CUBLAS)
+ return GGML_CUDA_MAX_DEVICES;
+ #elif defined(GGML_USE_SYCL)
+ return GGML_SYCL_MAX_DEVICES;
+ #else
+ return 1;
+ #endif
  }

- bool llama_mmap_supported(void) {
+ bool llama_supports_mmap(void) {
  return llama_mmap::SUPPORTED;
  }

- bool llama_mlock_supported(void) {
+ bool llama_supports_mlock(void) {
  return llama_mlock::SUPPORTED;
  }

+ bool llama_supports_gpu_offload(void) {
+ #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+ defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+ // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+ return true;
+ #else
+ return false;
+ #endif
+ }
+
+ // deprecated:
+ bool llama_mmap_supported(void) {
+ return llama_supports_mmap();
+ }
+
+ bool llama_mlock_supported(void) {
+ return llama_supports_mlock();
+ }
+
  void llama_backend_init(bool numa) {
  ggml_time_init();

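The capability queries are renamed to a consistent llama_supports_* scheme, llama_supports_gpu_offload() is new, and the old names remain as thin deprecated wrappers. Note that llama_max_devices() now returns size_t instead of int32_t. A small startup check using the new names might look like:

    #include "llama.h"
    #include <cstdio>

    static void print_capabilities(void) {
        printf("max devices : %zu\n", llama_max_devices());   // size_t now, hence %zu
        printf("mmap        : %s\n", llama_supports_mmap()        ? "yes" : "no");
        printf("mlock       : %s\n", llama_supports_mlock()       ? "yes" : "no");
        printf("gpu offload : %s\n", llama_supports_gpu_offload() ? "yes" : "no");
    }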
@@ -9331,6 +10350,7 @@ void llama_backend_free(void) {
  #ifdef GGML_USE_MPI
  ggml_mpi_backend_free();
  #endif
+ ggml_quantize_free();
  }

  int64_t llama_time_us(void) {
@@ -9338,8 +10358,8 @@ int64_t llama_time_us(void) {
  }

  struct llama_model * llama_load_model_from_file(
- const char * path_model,
- struct llama_model_params params) {
+ const char * path_model,
+ struct llama_model_params params) {
  ggml_time_init();

  llama_model * model = new llama_model;
@@ -9411,6 +10431,9 @@ struct llama_context * llama_new_context_with_model(
  hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
  hparams.n_ctx_train;

+ cparams.cb_eval = params.cb_eval;
+ cparams.cb_eval_user_data = params.cb_eval_user_data;
+
  auto rope_scaling_type = params.rope_scaling_type;
  if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
  rope_scaling_type = hparams.rope_scaling_type_train;
@@ -9477,6 +10500,36 @@ struct llama_context * llama_new_context_with_model(
  }
  }
  }
+ #elif defined(GGML_USE_VULKAN)
+ if (model->n_gpu_layers > 0) {
+ ggml_backend_t backend = ggml_backend_vk_init();
+ if (backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
+ llama_free(ctx);
+ return nullptr;
+ }
+ ctx->backends.push_back(backend);
+ }
+ #elif defined(GGML_USE_SYCL)
+ if (model->n_gpu_layers > 0) {
+ ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+ if (backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
+ llama_free(ctx);
+ return nullptr;
+ }
+ ctx->backends.push_back(backend);
+ }
+ #elif defined(GGML_USE_KOMPUTE)
+ if (model->n_gpu_layers > 0) {
+ auto * backend = ggml_backend_kompute_init(model->main_gpu);
+ if (backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
+ llama_free(ctx);
+ return nullptr;
+ }
+ ctx->backends.push_back(backend);
+ }
  #endif
  ctx->backend_cpu = ggml_backend_cpu_init();
  if (ctx->backend_cpu == nullptr) {
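With Vulkan, SYCL and Kompute handled alongside CUDA and Metal, offload is still requested the same way from the caller's side: set n_gpu_layers in llama_model_params and let the context pick whichever backend the library was built with. A hedged sketch of the caller side (the model path is a placeholder):

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_backend_init(false);

        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 99;  // request full offload; a CPU-only build simply ignores it
        mparams.main_gpu     = 0;   // device index used by the CUDA/SYCL/Kompute paths above

        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == nullptr) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        llama_context * ctx = llama_new_context_with_model(model, llama_context_default_params());
        if (ctx == nullptr) {  // e.g. one of the backend init calls above failed
            llama_free_model(model);
            return 1;
        }

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }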
@@ -9518,6 +10571,35 @@ struct llama_context * llama_new_context_with_model(
  ctx->embedding.resize(hparams.n_embd);
  }

+ // graph inputs
+ {
+ ggml_init_params init_params = {
+ /* .mem_size */ ggml_tensor_overhead()*5,
+ /* .mem_buffer */ nullptr,
+ /* .no_alloc */ true,
+ };
+ ctx->ctx_input = ggml_init(init_params);
+
+ ctx->inp_tokens = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
+ ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
+ ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
+ ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
+ ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
+
+ ggml_set_name(ctx->inp_tokens, "inp_tokens");
+ ggml_set_name(ctx->inp_embd, "inp_embd");
+ ggml_set_name(ctx->inp_pos, "inp_pos");
+ ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
+ ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
+
+ ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
+
+ LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
+ ggml_backend_buffer_name(ctx->buf_input),
+ ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
+ }
+
+ // scheduler and compute buffers
  {
  // buffer types used for the compute buffer of each backend
  std::vector<ggml_backend_buffer_type_t> backend_buft;
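The graph input tensors are now created once per context, in a small no_alloc ggml context (five tensors, hence ggml_tensor_overhead()*5), and placed in a host (CPU) buffer. Ignoring per-tensor alignment padding, the reported input buffer size follows directly from the shapes above; a back-of-the-envelope sketch:

    #include <cstddef>
    #include <cstdint>

    // inp_tokens : n_batch          x int32
    // inp_embd   : n_embd * n_batch x float
    // inp_pos    : n_batch          x int32
    // inp_KQ_mask: n_ctx  * n_batch x float
    // inp_K_shift: n_ctx            x int32
    static size_t input_buffer_bytes(size_t n_batch, size_t n_ctx, size_t n_embd) {
        return n_batch*sizeof(int32_t)
             + n_embd*n_batch*sizeof(float)
             + n_batch*sizeof(int32_t)
             + n_ctx*n_batch*sizeof(float)
             + n_ctx*sizeof(int32_t);
    }
    // e.g. n_batch = 512, n_ctx = 4096, n_embd = 4096 -> roughly 16 MiB,
    // dominated by inp_embd and inp_KQ_mask.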
@@ -9544,9 +10626,6 @@ struct llama_context * llama_new_context_with_model(

  // initialize scheduler with the worst-case graph
  ggml_backend_sched_init_measure(ctx->sched, gf);
- // note: the number of splits during measure is higher than during inference due to the kv shift
- int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
- LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
  ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);

  for (ggml_backend_t backend : ctx->backends) {
@@ -9555,6 +10634,10 @@ struct llama_context * llama_new_context_with_model(
  ggml_backend_buffer_name(buf),
  ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
  }
+
+ // note: the number of splits during measure is higher than during inference due to the kv shift
+ int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
+ LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
  }
  }

@@ -10294,22 +11377,24 @@ struct llama_batch llama_batch_get_one(
  };
  }

- struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
+ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
  llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };

  if (embd) {
- batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
+ batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
  } else {
- batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
+ batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
  }

- batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
- batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens);
- batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
- for (int i = 0; i < n_tokens; ++i) {
+ batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens_alloc);
+ batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens_alloc);
+ batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
+ for (int i = 0; i < n_tokens_alloc; ++i) {
  batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
  }
- batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
+ batch.seq_id[n_tokens_alloc] = nullptr;
+
+ batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens_alloc);

  return batch;
  }
@@ -10320,7 +11405,7 @@ void llama_batch_free(struct llama_batch batch) {
  if (batch.pos) free(batch.pos);
  if (batch.n_seq_id) free(batch.n_seq_id);
  if (batch.seq_id) {
- for (int i = 0; i < batch.n_tokens; ++i) {
+ for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
  free(batch.seq_id[i]);
  }
  free(batch.seq_id);
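Together these two hunks make llama_batch allocation self-describing: llama_batch_init now writes a trailing nullptr into seq_id, and llama_batch_free walks up to that sentinel instead of trusting batch.n_tokens, which callers typically set to the number of tokens used rather than the number allocated. A minimal usage sketch under that reading:

    #include "llama.h"

    int main() {
        // room for up to 32 tokens, token ids rather than embeddings, 1 sequence id per token
        llama_batch batch = llama_batch_init(/*n_tokens_alloc =*/ 32, /*embd =*/ 0, /*n_seq_max =*/ 1);

        batch.n_tokens     = 1;   // number of tokens actually used, not allocated
        batch.token    [0] = 1;   // placeholder token id
        batch.pos      [0] = 0;
        batch.n_seq_id [0] = 1;
        batch.seq_id[0][0] = 0;
        batch.logits   [0] = true;

        // safe even though n_tokens (1) != n_tokens_alloc (32): the free loop
        // stops at the nullptr sentinel written by llama_batch_init
        llama_batch_free(batch);
        return 0;
    }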