llama_cpp 0.14.3 → 0.14.5

This diff shows the changes between publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -7,7 +7,7 @@
  #include "ggml-alloc.h"
  #include "ggml-backend.h"

- #ifdef GGML_USE_CUBLAS
+ #ifdef GGML_USE_CUDA
  # include "ggml-cuda.h"
  #elif defined(GGML_USE_CLBLAST)
  # include "ggml-opencl.h"
@@ -52,12 +52,16 @@
  #define NOMINMAX
  #endif
  #include <windows.h>
+ #ifndef PATH_MAX
+ #define PATH_MAX MAX_PATH
+ #endif
  #include <io.h>
  #endif

  #include <algorithm>
  #include <array>
  #include <cassert>
+ #include <cctype>
  #include <cfloat>
  #include <cinttypes>
  #include <climits>
@@ -68,7 +72,6 @@
  #include <cstdio>
  #include <cstring>
  #include <ctime>
- #include <cwctype>
  #include <forward_list>
  #include <fstream>
  #include <functional>
@@ -192,6 +195,7 @@ enum llm_arch {
  LLM_ARCH_LLAMA,
  LLM_ARCH_FALCON,
  LLM_ARCH_BAICHUAN,
+ LLM_ARCH_GROK,
  LLM_ARCH_GPT2,
  LLM_ARCH_GPTJ,
  LLM_ARCH_GPTNEOX,
@@ -214,6 +218,7 @@ enum llm_arch {
  LLM_ARCH_GEMMA,
  LLM_ARCH_STARCODER2,
  LLM_ARCH_MAMBA,
+ LLM_ARCH_XVERSE,
  LLM_ARCH_COMMAND_R,
  LLM_ARCH_UNKNOWN,
  };
@@ -221,6 +226,7 @@ enum llm_arch {
  static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_LLAMA, "llama" },
  { LLM_ARCH_FALCON, "falcon" },
+ { LLM_ARCH_GROK, "grok" },
  { LLM_ARCH_GPT2, "gpt2" },
  { LLM_ARCH_GPTJ, "gptj" },
  { LLM_ARCH_GPTNEOX, "gptneox" },
@@ -244,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_GEMMA, "gemma" },
  { LLM_ARCH_STARCODER2, "starcoder2" },
  { LLM_ARCH_MAMBA, "mamba" },
+ { LLM_ARCH_XVERSE, "xverse" },
  { LLM_ARCH_COMMAND_R, "command-r" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };
@@ -254,6 +261,7 @@ enum llm_kv {
  LLM_KV_GENERAL_ALIGNMENT,
  LLM_KV_GENERAL_NAME,
  LLM_KV_GENERAL_AUTHOR,
+ LLM_KV_GENERAL_VERSION,
  LLM_KV_GENERAL_URL,
  LLM_KV_GENERAL_DESCRIPTION,
  LLM_KV_GENERAL_LICENSE,
@@ -290,6 +298,10 @@ enum llm_kv {
  LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
  LLM_KV_ROPE_SCALING_FINETUNED,

+ LLM_KV_SPLIT_NO,
+ LLM_KV_SPLIT_COUNT,
+ LLM_KV_SPLIT_TENSORS_COUNT,
+
  LLM_KV_SSM_INNER_SIZE,
  LLM_KV_SSM_CONV_KERNEL,
  LLM_KV_SSM_STATE_SIZE,
@@ -306,6 +318,8 @@ enum llm_kv {
  LLM_KV_TOKENIZER_UNK_ID,
  LLM_KV_TOKENIZER_SEP_ID,
  LLM_KV_TOKENIZER_PAD_ID,
+ LLM_KV_TOKENIZER_CLS_ID,
+ LLM_KV_TOKENIZER_MASK_ID,
  LLM_KV_TOKENIZER_ADD_BOS,
  LLM_KV_TOKENIZER_ADD_EOS,
  LLM_KV_TOKENIZER_ADD_PREFIX,
@@ -319,6 +333,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
  { LLM_KV_GENERAL_NAME, "general.name" },
  { LLM_KV_GENERAL_AUTHOR, "general.author" },
+ { LLM_KV_GENERAL_VERSION, "general.version" },
  { LLM_KV_GENERAL_URL, "general.url" },
  { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
  { LLM_KV_GENERAL_LICENSE, "general.license" },
@@ -355,6 +370,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
  { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },

+ { LLM_KV_SPLIT_NO, "split.no" },
+ { LLM_KV_SPLIT_COUNT, "split.count" },
+ { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
+
  { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
  { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
  { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
@@ -371,6 +390,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
  { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
  { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+ { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
+ { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
  { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
  { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
  { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
@@ -411,9 +432,12 @@ enum llm_tensor {
  LLM_TENSOR_FFN_DOWN,
  LLM_TENSOR_FFN_UP,
  LLM_TENSOR_FFN_ACT,
- LLM_TENSOR_FFN_DOWN_EXP,
+ LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
  LLM_TENSOR_FFN_GATE_EXP,
  LLM_TENSOR_FFN_UP_EXP,
+ LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
  LLM_TENSOR_ATTN_Q_NORM,
  LLM_TENSOR_ATTN_K_NORM,
  LLM_TENSOR_LAYER_OUT_NORM,
@@ -448,6 +472,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
  { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
  { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
  },
  },
  {
@@ -483,6 +510,31 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_GROK,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+ },
+ },
  {
  LLM_ARCH_GPT2,
  {
@@ -548,6 +600,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
+ { LLM_TENSOR_POS_EMBD, "position_embd" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
  },
  },
  {
@@ -843,6 +898,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
  },
  },
+ {
+ LLM_ARCH_XVERSE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_COMMAND_R,
  {
@@ -856,6 +930,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
  },
  },
  {
@@ -1030,7 +1106,7 @@ struct llama_file {
  size_t size;

  llama_file(const char * fname, const char * mode) {
- fp = std::fopen(fname, mode);
+ fp = ggml_fopen(fname, mode);
  if (fp == NULL) {
  throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
  }
@@ -1099,6 +1175,7 @@ struct llama_file {
  }
  }
  };
+ using llama_files = std::vector<std::unique_ptr<llama_file>>;

  struct llama_mmap {
  void * addr;
@@ -1299,6 +1376,7 @@ struct llama_mmap {
  }
  #endif
  };
+ using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;

  // Represents some region of memory being locked using mlock or VirtualLock;
  // will automatically unlock on destruction.
@@ -1448,6 +1526,7 @@ struct llama_mlock {
  static void raw_unlock(const void * addr, size_t len) {}
  #endif
  };
+ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

  static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
  std::vector<char> result(8, 0);
@@ -1467,7 +1546,7 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
  static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
  ggml_backend_buffer_type_t buft = nullptr;

- #if defined(GGML_USE_CUBLAS)
+ #if defined(GGML_USE_CUDA)
  // host buffers should only be used when data is expected to be copied to/from the GPU
  if (host_buffer) {
  buft = ggml_backend_cuda_host_buffer_type();
@@ -1497,7 +1576,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {

  #ifdef GGML_USE_METAL
  buft = ggml_backend_metal_buffer_type();
- #elif defined(GGML_USE_CUBLAS)
+ #elif defined(GGML_USE_CUDA)
  buft = ggml_backend_cuda_buffer_type(gpu);
  #elif defined(GGML_USE_VULKAN)
  buft = ggml_backend_vk_buffer_type(gpu);
@@ -1523,7 +1602,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
  static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
  ggml_backend_buffer_type_t buft = nullptr;

- #ifdef GGML_USE_CUBLAS
+ #ifdef GGML_USE_CUDA
  if (ggml_backend_cuda_get_device_count() > 1) {
  buft = ggml_backend_cuda_split_buffer_type(tensor_split);
  }
@@ -1544,7 +1623,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
  }

  static size_t llama_get_device_count() {
- #if defined(GGML_USE_CUBLAS)
+ #if defined(GGML_USE_CUDA)
  return ggml_backend_cuda_get_device_count();
  #elif defined(GGML_USE_SYCL)
  return ggml_backend_sycl_get_device_count();
@@ -1556,20 +1635,20 @@ static size_t llama_get_device_count() {
  }

  static size_t llama_get_device_memory(int device) {
- #if defined(GGML_USE_CUBLAS)
+ #if defined(GGML_USE_CUDA)
  size_t total;
  size_t free;
- ggml_backend_cuda_get_device_memory(device, &total, &free);
+ ggml_backend_cuda_get_device_memory(device, &free, &total);
  return free;
  #elif defined(GGML_USE_SYCL)
  size_t total;
  size_t free;
- ggml_backend_sycl_get_device_memory(device, &total, &free);
+ ggml_backend_sycl_get_device_memory(device, &free, &total);
  return free;
  #elif defined(GGML_USE_VULKAN)
  size_t total;
  size_t free;
- ggml_backend_vk_get_device_memory(device, &total, &free);
+ ggml_backend_vk_get_device_memory(device, &free, &total);
  return free;
  #else
  return 1;
@@ -1621,10 +1700,13 @@ enum e_model {
  MODEL_40B,
  MODEL_65B,
  MODEL_70B,
+ MODEL_314B,
  MODEL_SMALL,
  MODEL_MEDIUM,
  MODEL_LARGE,
  MODEL_XL,
+ MODEL_8x7B,
+ MODEL_8x22B,
  };

  static const size_t kiB = 1024;
@@ -1738,6 +1820,7 @@ struct llama_cparams {
  uint32_t n_ctx; // context size used during inference
  uint32_t n_batch;
  uint32_t n_ubatch;
+ uint32_t n_seq_max;
  uint32_t n_threads; // number of threads to use for generation
  uint32_t n_threads_batch; // number of threads to use for batch processing

@@ -1803,9 +1886,9 @@ struct llama_layer {

  // ff MoE
  struct ggml_tensor * ffn_gate_inp;
- struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
- struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
- struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
+ struct ggml_tensor * ffn_gate_exps;
+ struct ggml_tensor * ffn_down_exps;
+ struct ggml_tensor * ffn_up_exps ;

  // ff bias
  struct ggml_tensor * ffn_down_b; // b2
@@ -1941,11 +2024,13 @@ struct llama_vocab {
  std::map<std::pair<std::string, std::string>, int> bpe_ranks;

  // default LLaMA special tokens
- id special_bos_id = 1;
- id special_eos_id = 2;
- id special_unk_id = 0;
- id special_sep_id = -1;
- id special_pad_id = -1;
+ id special_bos_id = 1;
+ id special_eos_id = 2;
+ id special_unk_id = 0;
+ id special_sep_id = -1;
+ id special_pad_id = -1;
+ id special_cls_id = -1;
+ id special_mask_id = -1;

  int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
  int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
@@ -2023,12 +2108,12 @@ struct llama_model {
  // the model memory buffers for the tensor data
  std::vector<ggml_backend_buffer_t> bufs;

- // model memory mapped file
- std::unique_ptr<llama_mmap> mapping;
+ // model memory mapped files
+ llama_mmaps mappings;

  // objects representing data potentially being locked in memory
- std::vector<std::unique_ptr<llama_mlock>> mlock_bufs;
- llama_mlock mlock_mmap;
+ llama_mlocks mlock_bufs;
+ llama_mlocks mlock_mmaps;

  // for quantize-stats only
  std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
@@ -2041,7 +2126,7 @@ struct llama_model {
  ggml_free(ctx);
  }
  for (ggml_backend_buffer_t buf : bufs) {
- #ifdef GGML_USE_CUBLAS
+ #ifdef GGML_USE_CUDA
  if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
  ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
  }
@@ -2060,10 +2145,6 @@ struct llama_context {
  ggml_backend_free(backend);
  }

- #ifdef GGML_USE_VULKAN
- ggml_vk_free_cpu_assist();
- #endif
-
  ggml_backend_buffer_free(buf_output);
  }

@@ -2100,20 +2181,20 @@ struct llama_context {
  // host buffer for the model output (logits and embeddings)
  ggml_backend_buffer_t buf_output = nullptr;

- // decode output (2-dimensional array: [n_tokens][n_vocab])
- size_t logits_size = 0;
- float * logits = nullptr;
+ // decode output (2-dimensional array: [n_outputs][n_vocab])
+ size_t logits_size = 0; // capacity (of floats) for logits
+ float * logits = nullptr;
+
+ std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
+ size_t output_size = 0; // capacity (of tokens positions) for the output buffers
+ int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch

- #ifndef NDEBUG
- // guard against access to unset logits
- std::vector<bool> logits_valid;
- #endif
  bool logits_all = false;

- // embeddings output (2-dimensional array: [n_tokens][n_embd])
+ // embeddings output (2-dimensional array: [n_outputs][n_embd])
  // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
- size_t embd_size = 0;
- float * embd = nullptr;
+ size_t embd_size = 0; // capacity (of floats) for embeddings
+ float * embd = nullptr;

  // sequence embeddings output (map of [n_embd] vectors)
  // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
@@ -2130,14 +2211,15 @@ struct llama_context {
  struct ggml_tensor * inp_tokens; // I32 [n_batch]
  struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
  struct ggml_tensor * inp_pos; // I32 [n_batch]
+ struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
  struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
- struct ggml_tensor * inp_KQ_pos; // F32 [kv_size]
+ struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
  struct ggml_tensor * inp_K_shift; // I32 [kv_size]
  struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
  struct ggml_tensor * inp_cls; // I32 [n_batch]
  struct ggml_tensor * inp_s_copy; // I32 [kv_size]
- struct ggml_tensor * inp_s_mask; // F32 [1, kv_size]
- struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch]
+ struct ggml_tensor * inp_s_mask; // F32 [1, n_kv]
+ struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]

  // control vectors
  struct llama_control_vector cvec;
@@ -2792,6 +2874,8 @@ namespace GGUFMeta {
  };
  }

+ using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
+
  struct llama_model_loader {
  int n_kv = 0;
  int n_tensors = 0;
@@ -2802,54 +2886,133 @@ struct llama_model_loader {

  bool use_mmap = false;

- llama_file file;
+ llama_files files;
  llama_ftype ftype;
  llama_fver fver;

- std::unique_ptr<llama_mmap> mapping;
+ llama_mmaps mappings;
+
+ // Holds information on a model weight
+ struct llama_tensor_weight {
+ uint16_t idx; // source file index
+ size_t offs; // tensor data offset in the original file
+
+ ggml_tensor * tensor;
+
+ llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+ const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
+ offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+ }
+ };
+ std::vector<llama_tensor_weight> weights;
+
  std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;

- struct gguf_context * ctx_gguf = NULL;
- struct ggml_context * ctx_meta = NULL;
+ struct gguf_context * meta = NULL;
+ std::vector<ggml_context *> contexts;

  std::string arch_name;
  LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

- llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
+ llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
  int trace = 0;
  if (getenv("LLAMA_TRACE")) {
  trace = atoi(getenv("LLAMA_TRACE"));
  }

- struct gguf_init_params params = {
- /*.no_alloc = */ true,
- /*.ctx = */ &ctx_meta,
- };
-
  if (param_overrides_p != nullptr) {
  for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
  kv_overrides.insert({std::string(p->key), *p});
  }
  }

- ctx_gguf = gguf_init_from_file(fname.c_str(), params);
- if (!ctx_gguf) {
+ struct ggml_context * ctx = NULL;
+ struct gguf_init_params params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ &ctx,
+ };
+
+ meta = gguf_init_from_file(fname.c_str(), params);
+ if (!meta) {
  throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
  }

  get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
  llm_kv = LLM_KV(llm_arch_from_string(arch_name));

- n_kv = gguf_get_n_kv(ctx_gguf);
- n_tensors = gguf_get_n_tensors(ctx_gguf);
+ // Save tensors data offset of the main file.
+ // For subsidiary files, `meta` tensor data offset must not be used,
+ // so we build a unified tensors index for weights.
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+ weights.emplace_back(0, cur->name, meta, cur);
+ }
+ files.emplace_back(new llama_file(fname.c_str(), "rb"));
+ contexts.emplace_back(ctx);
+
+ uint16_t n_split = 0;
+ get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
+
+ // Load additional GGML contexts
+ if (n_split > 1) {
+ uint16_t idx = 0;
+ get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
+ if (idx != 0) {
+ throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
+ }
+
+ char split_prefix[PATH_MAX] = {0};
+ if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) {
+ throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
+ }
+
+ if (trace > 0) {
+ LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
+ }
+
+ char split_path[PATH_MAX] = {0};
+ for (idx = 1; idx < n_split; idx++) {
+ llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
+
+ struct gguf_init_params split_params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ &ctx,
+ };
+ struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
+ if (!ctx_gguf) {
+ throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
+ }
+
+ // Save tensors data offset info of the shard.
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+ weights.emplace_back(idx, cur->name, ctx_gguf, cur);
+ }
+ files.emplace_back(new llama_file(split_path, "rb"));
+ contexts.emplace_back(ctx);
+
+ gguf_free(ctx_gguf);
+ }
+
+ get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);

- fver = (enum llama_fver ) gguf_get_version(ctx_gguf);
+ // sanity check
+ {
+ const int n_tensors_loaded = (int) weights.size();
+ if (n_tensors != n_tensors_loaded) {
+ throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
+ }
+ }
+
+ LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
+ }
+
+ n_kv = gguf_get_n_kv(meta);
+ n_tensors = weights.size();

- for (int i = 0; i < n_tensors; i++) {
- const char * name = gguf_get_tensor_name(ctx_gguf, i);
- struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
- n_elements += ggml_nelements(t);
- n_bytes += ggml_nbytes(t);
+ fver = (enum llama_fver) gguf_get_version(meta);
+
+ for (auto & w : weights) {
+ n_elements += ggml_nelements(w.tensor);
+ n_bytes += ggml_nbytes(w.tensor);
  }

  LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -2864,7 +3027,8 @@ struct llama_model_loader {
  enum ggml_type type_max = GGML_TYPE_F32;

  for (int i = 0; i < n_tensors; i++) {
- enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);
+ const ggml_tensor * tensor = weights.at(i).tensor;
+ enum ggml_type type = tensor->type;

  n_type[type]++;

@@ -2874,8 +3038,8 @@ struct llama_model_loader {
  }

  if (trace > 0) {
- struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
- LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
+ const uint16_t sid = weights.at(i).idx;
+ LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
  }
  }

@@ -2897,6 +3061,7 @@ struct llama_model_loader {
  case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
  case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
  case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
+ case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break;
  case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
  case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
  case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
@@ -2911,22 +3076,23 @@ struct llama_model_loader {
  ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);

  {
- const int kid = gguf_find_key(ctx_gguf, "general.file_type");
+ const int kid = gguf_find_key(meta, "general.file_type");
  if (kid >= 0) {
- ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
+ ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
  }
  }

  LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+
  for (int i = 0; i < n_kv; i++) {
- const char * name = gguf_get_key(ctx_gguf, i);
- const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+ const char * name = gguf_get_key(meta, i);
+ const enum gguf_type type = gguf_get_kv_type(meta, i);
  const std::string type_name =
  type == GGUF_TYPE_ARRAY
- ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
+ ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
  : gguf_type_name(type);

- std::string value = gguf_kv_to_str(ctx_gguf, i);
+ std::string value = gguf_kv_to_str(meta, i);
  const size_t MAX_VALUE_LEN = 40;
  if (value.size() > MAX_VALUE_LEN) {
  value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
@@ -2955,18 +3121,18 @@ struct llama_model_loader {
  }

  ~llama_model_loader() {
- if (ctx_gguf) {
- gguf_free(ctx_gguf);
+ if (meta) {
+ gguf_free(meta);
  }
- if (ctx_meta) {
- ggml_free(ctx_meta);
+ for (auto * ctx : contexts) {
+ ggml_free(ctx);
  }
  }

  template<typename T>
  typename std::enable_if<std::is_integral<T>::value, bool>::type
  get_arr_n(const std::string & key, T & result, const bool required = true) {
- const int kid = gguf_find_key(ctx_gguf, key.c_str());
+ const int kid = gguf_find_key(meta, key.c_str());

  if (kid < 0) {
  if (required) {
@@ -2976,7 +3142,7 @@ struct llama_model_loader {
  }

  struct GGUFMeta::ArrayInfo arr_info =
- GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);


  result = arr_info.length;
@@ -2996,7 +3162,7 @@ struct llama_model_loader {
  const struct llama_model_kv_override * override =
  it != kv_overrides.end() ? &it->second : nullptr;

- const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);
+ const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);

  if (required && !found) {
  throw std::runtime_error(format("key not found in model: %s", key.c_str()));
@@ -3019,28 +3185,57 @@ struct llama_model_loader {
  }

  const char * get_tensor_name(int i) const {
- return gguf_get_tensor_name(ctx_gguf, i);
+ return weights.at(i).tensor->name;
+ }
+
+ const llama_tensor_weight * get_weight(const char * name) const {
+ for (const auto & weight : weights) {
+ if (strcmp(name, weight.tensor->name) == 0) {
+ return &weight;
+ }
+ }
+ return nullptr;
+ }
+
+ const llama_tensor_weight & require_weight(const char * name) const {
+ const llama_tensor_weight * weight = get_weight(name);
+ if (!weight) {
+ throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
+ }
+ return *weight;
  }

  struct ggml_tensor * get_tensor_meta(const char * name) const {
- return ggml_get_tensor(ctx_meta, name);
+ const auto * weight = get_weight(name);
+ if (!weight) {
+ return nullptr;
+ }
+ return weight->tensor;
+ }
+
+ struct ggml_tensor * require_tensor_meta(const char * name) const {
+ struct ggml_tensor * tensor = get_tensor_meta(name);
+ if (!tensor) {
+ throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
+ }
+ return tensor;
  }

  struct ggml_tensor * get_tensor_meta(int i) const {
  return get_tensor_meta(get_tensor_name(i));
  }

- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) {
- struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
- ggml_set_name(tensor, ggml_get_name(meta));
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+ struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
+ ggml_set_name(tensor, ggml_get_name(cur));

  n_created++;

  return tensor;
  }

- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
- struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
+ const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
+ const struct ggml_tensor * cur = get_tensor_meta(name.c_str());

  if (cur == NULL) {
  if (!required) {
@@ -3051,8 +3246,8 @@ struct llama_model_loader {

  {
  bool is_ok = true;
- for (size_t i = 0; i < ne.size(); ++i) {
- if (ne[i] != cur->ne[i]) {
+ for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+ if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
  is_ok = false;
  break;
  }
@@ -3066,127 +3261,196 @@ struct llama_model_loader {
  }
  }

- return create_tensor_for(ctx, cur);
+ return cur;
  }

- void done_getting_tensors() const {
- if (n_created != n_tensors) {
- throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+
+ if (cur == NULL) {
+ return NULL;
  }
+
+ return create_tensor_for(ctx, cur);
  }

- size_t file_offset(const char * name) const {
- const int idx = gguf_find_tensor(ctx_gguf, name);
+ struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);

- if (idx < 0) {
- throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name));
+ if (cur == NULL) {
+ return NULL;
  }

- return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
- }
+ if (cur->type != base->type) {
+ throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type)));
+ }

- void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) {
- // prefetch the whole file - all the data is needed anyway
- if (use_mmap) {
- mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
+ std::array<int64_t, GGML_MAX_DIMS> dims;
+ for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+ dims[i] = i < ne.size() ? ne[i] : 1;
  }

- // compute the total size of all tensors for progress reporting
- for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
- struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
- size_data += ggml_nbytes(cur);
+ struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
+ dims[0], dims[1], dims[2], dims[3],
+ cur->nb[1], cur->nb[2], cur->nb[3],
+ offset);
+
+ ggml_set_name(tensor, name.c_str());
+
+ n_created++;
+
+ return tensor;
+ }
+
+ void done_getting_tensors() const {
+ if (n_created != n_tensors) {
+ throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
  }
+ }

- if (use_mmap && mapping) {
- if (lmlock) {
- lmlock->init(mapping->addr);
+ void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) {
+ if (use_mmap) {
+ mappings.reserve(files.size());
+ mmaps_used.reserve(files.size());
+ for (const auto & file : files) {
+ std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
+ mmaps_used.emplace_back(mapping->size, 0);
+ if (mlock_mmaps) {
+ std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
+ mlock_mmap->init(mapping->addr);
+ mlock_mmaps->emplace_back(std::move(mlock_mmap));
+ }
+ mappings.emplace_back(std::move(mapping));
  }
- mmap_used_first = mapping->size;
+ }
+
+ // compute the total size of all tensors for progress reporting
+ for (auto & w : weights) {
+ size_data += ggml_nbytes(w.tensor);
  }
  }

- void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
- GGML_ASSERT(mapping);
+ void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
+ GGML_ASSERT(!mappings.empty());
+ const auto & mapping = mappings.at(idx);

  *first = mapping->size;
  *last = 0;
+ *addr = mapping->addr;
  for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
- const size_t offs = file_offset(ggml_get_name(tensor));
- *first = std::min(*first, offs);
- *last = std::max(*last, offs + ggml_nbytes(tensor));
+ try {
+ const auto * weight = get_weight(ggml_get_name(tensor));
+ if (!weight) {
+ continue;
+ }
+ if (weight->idx != idx) {
+ continue;
+ }
+ *first = std::min(*first, weight->offs);
+ *last = std::max(*last, weight->offs + ggml_nbytes(tensor));
+ } catch(...) {
+ // the tensor is not in the model
+ }
  }
  }

  // for backwards compatibility, does not support ggml-backend
  void load_data_for(struct ggml_tensor * cur) const {
- const size_t offs = file_offset(ggml_get_name(cur));
+ const auto & w = require_weight(ggml_get_name(cur));

- if (use_mmap && mapping) {
+ if (use_mmap) {
+ const auto & mapping = mappings.at(w.idx);
  if (cur->data == nullptr) {
- cur->data = (uint8_t *)mapping->addr + offs;
+ cur->data = (uint8_t *)mapping->addr + w.offs;
  } else {
- memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
+ memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur));
  }
  } else {
  GGML_ASSERT(cur->data != nullptr);
- file.seek(offs, SEEK_SET);
- file.read_raw(cur->data, ggml_nbytes(cur));
+ GGML_ASSERT(w.idx < files.size());
+ const auto & file = files.at(w.idx);
+ file->seek(w.offs, SEEK_SET);
+ file->read_raw(cur->data, ggml_nbytes(cur));
  }
  }

  size_t size_done = 0;
  size_t size_data = 0;
- size_t mmap_used_first = -1;
- size_t mmap_used_last = 0;
+ std::vector<std::pair<size_t, size_t>> mmaps_used;

  // Returns false if cancelled by progress_callback
- bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) {
- GGML_ASSERT(size_data != 0 && "call init_mapping() first");
+ bool load_all_data(
+ struct ggml_context * ctx,
+ llama_buf_map & bufs_mmap,
+ llama_mlocks * lmlocks,
+ llama_progress_callback progress_callback,
+ void * progress_callback_user_data) {
+ GGML_ASSERT(size_data != 0 && "call init_mappings() first");

  std::vector<no_init<uint8_t>> read_buf;
-
  for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+ const auto * weight = get_weight(ggml_get_name(cur));
+ if (weight == nullptr) {
+ // this can happen with split experts models
+ continue;
+ }
+
  if (progress_callback) {
  if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
  return false;
  }
  }

- const size_t offs = file_offset(ggml_get_name(cur));
+ size_t n_size = ggml_nbytes(cur);

- if (use_mmap && mapping) {
+ if (use_mmap) {
+ const auto & mapping = mappings.at(weight->idx);
+ ggml_backend_buffer_t buf_mmap = nullptr;
+ if (bufs_mmap.count(weight->idx)) {
+ buf_mmap = bufs_mmap.at(weight->idx);
+ }
+ GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
  if (buf_mmap && cur->data == nullptr) {
- ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
- if (lmlock) {
- lmlock->grow_to(offs + ggml_nbytes(cur));
+ ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
+ if (lmlocks) {
+ const auto & lmlock = lmlocks->at(weight->idx);
+ lmlock->grow_to(weight->offs + ggml_nbytes(cur));
  }
- mmap_used_first = std::min(mmap_used_first, offs);
- mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur));
+
+ auto & mmap_used = mmaps_used[weight->idx];
+ mmap_used.first = std::min(mmap_used.first, weight->offs);
+ mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
  } else {
- ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
+ ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
  }
  } else {
+ GGML_ASSERT(weight->idx < files.size());
+ const auto & file = files.at(weight->idx);
  if (ggml_backend_buffer_is_host(cur->buffer)) {
- file.seek(offs, SEEK_SET);
- file.read_raw(cur->data, ggml_nbytes(cur));
+ file->seek(weight->offs, SEEK_SET);
+ file->read_raw(cur->data, ggml_nbytes(cur));
  } else {
  read_buf.resize(ggml_nbytes(cur));
- file.seek(offs, SEEK_SET);
- file.read_raw(read_buf.data(), ggml_nbytes(cur));
- ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
+ file->seek(weight->offs, SEEK_SET);
+ file->read_raw(read_buf.data(), ggml_nbytes(cur));
+ ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
  }
  }

- size_done += ggml_nbytes(cur);
+ size_done += n_size;
  }

  // check if this is the last call and do final cleanup
  if (size_done >= size_data) {
  // unmap offloaded tensors and metadata
- if (use_mmap && mapping) {
- mapping->unmap_fragment(0, mmap_used_first);
- if (mmap_used_last != 0) {
- mapping->unmap_fragment(mmap_used_last, mapping->size);
+ if (use_mmap) {
+ for (uint32_t idx = 0; idx < mappings.size(); idx++) {
+ const auto & mmap_used = mmaps_used.at(idx);
+ auto & mapping = mappings.at(idx);
+ mapping->unmap_fragment(0, mmap_used.first);
+ if (mmap_used.second != 0) {
+ mapping->unmap_fragment(mmap_used.second, mapping->size);
+ }
  }
  }
  if (progress_callback) {
@@ -3259,6 +3523,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ1_M :return "IQ1_M - 1.75 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
@@ -3290,10 +3555,13 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_40B: return "40B";
  case MODEL_65B: return "65B";
  case MODEL_70B: return "70B";
+ case MODEL_314B: return "314B";
  case MODEL_SMALL: return "0.1B";
  case MODEL_MEDIUM: return "0.4B";
  case MODEL_LARGE: return "0.8B";
  case MODEL_XL: return "1.5B";
+ case MODEL_8x7B: return "8x7B";
+ case MODEL_8x22B: return "8x22B";
  default: return "?B";
  }
  }
@@ -3319,7 +3587,7 @@ static void llm_load_hparams(
  llama_model_loader & ml,
  llama_model & model) {
  auto & hparams = model.hparams;
- const gguf_context * ctx = ml.ctx_gguf;
+ const gguf_context * ctx = ml.meta;

  // get metadata as string
  for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -3408,15 +3676,23 @@ static void llm_load_hparams(
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

- switch (hparams.n_layer) {
- case 22: model.type = e_model::MODEL_1B; break;
- case 26: model.type = e_model::MODEL_3B; break;
- case 32: model.type = e_model::MODEL_7B; break;
- case 40: model.type = e_model::MODEL_13B; break;
- case 48: model.type = e_model::MODEL_34B; break;
- case 60: model.type = e_model::MODEL_30B; break;
- case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ if (hparams.n_expert == 8) {
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_8x7B; break;
+ case 56: model.type = e_model::MODEL_8x22B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } else {
+ switch (hparams.n_layer) {
+ case 22: model.type = e_model::MODEL_1B; break;
+ case 26: model.type = e_model::MODEL_3B; break;
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 40: model.type = e_model::MODEL_13B; break;
+ case 48: model.type = e_model::MODEL_34B; break;
+ case 60: model.type = e_model::MODEL_30B; break;
+ case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
  }
  } break;
  case LLM_ARCH_MINICPM:
@@ -3428,6 +3704,15 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_GROK:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 64: model.type = e_model::MODEL_314B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_FALCON:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3679,6 +3964,16 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_XVERSE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 40: model.type = e_model::MODEL_13B; break;
+ case 80: model.type = e_model::MODEL_65B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_COMMAND_R:
  {
  ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -3701,7 +3996,9 @@ static void llm_load_hparams(
  }

  // TODO: This should probably be in llama.h
- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
+ static std::vector<llama_vocab::id> llama_tokenize_internal(
+ const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special = false
+ );
  static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);

  static void llm_load_vocab(
@@ -3709,7 +4006,7 @@ static void llm_load_vocab(
  llama_model & model) {
  auto & vocab = model.vocab;

- struct gguf_context * ctx = ml.ctx_gguf;
+ struct gguf_context * ctx = ml.meta;

  const auto kv = LLM_KV(model.arch);

@@ -3723,23 +4020,27 @@ static void llm_load_vocab(
  vocab.type = LLAMA_VOCAB_TYPE_NONE;

  // default special tokens
- vocab.special_bos_id = -1;
- vocab.special_eos_id = -1;
- vocab.special_unk_id = -1;
- vocab.special_sep_id = -1;
- vocab.special_pad_id = -1;
- vocab.linefeed_id = -1;
+ vocab.special_bos_id = -1;
+ vocab.special_eos_id = -1;
+ vocab.special_unk_id = -1;
+ vocab.special_sep_id = -1;
+ vocab.special_pad_id = -1;
+ vocab.special_cls_id = -1;
+ vocab.special_mask_id = -1;
+ vocab.linefeed_id = -1;

  return;
  } else if (tokenizer_name == "llama") {
  vocab.type = LLAMA_VOCAB_TYPE_SPM;

  // default special tokens
- vocab.special_bos_id = 1;
- vocab.special_eos_id = 2;
- vocab.special_unk_id = 0;
- vocab.special_sep_id = -1;
- vocab.special_pad_id = -1;
+ vocab.special_bos_id = 1;
+ vocab.special_eos_id = 2;
+ vocab.special_unk_id = 0;
+ vocab.special_sep_id = -1;
+ vocab.special_pad_id = -1;
+ vocab.special_cls_id = -1;
+ vocab.special_mask_id = -1;

  const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
  if (add_space_prefix_keyidx != -1) {
@@ -3774,20 +4075,24 @@ static void llm_load_vocab(
  }

  // default special tokens
- vocab.special_bos_id = 11;
- vocab.special_eos_id = 11;
- vocab.special_unk_id = -1;
- vocab.special_sep_id = -1;
- vocab.special_pad_id = -1;
+ vocab.special_bos_id = 11;
+ vocab.special_eos_id = 11;
+ vocab.special_unk_id = -1;
+ vocab.special_sep_id = -1;
+ vocab.special_pad_id = -1;
+ vocab.special_cls_id = -1;
+ vocab.special_mask_id = -1;
  } else if (tokenizer_name == "bert") {
  vocab.type = LLAMA_VOCAB_TYPE_WPM;

  // default special tokens
- vocab.special_bos_id = 101;
- vocab.special_eos_id = 102;
- vocab.special_unk_id = 100;
- vocab.special_sep_id = -1;
- vocab.special_pad_id = -1;
+ vocab.special_bos_id = -1;
+ vocab.special_eos_id = -1;
+ vocab.special_unk_id = 100;
+ vocab.special_sep_id = 102;
+ vocab.special_pad_id = 0;
+ vocab.special_cls_id = 101;
+ vocab.special_mask_id = 103;
  vocab.add_space_prefix = false;
  } else {
  LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
@@ -3842,7 +4147,7 @@ static void llm_load_vocab(
  } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
  vocab.linefeed_id = vocab.special_pad_id;
  } else {
- const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
+ const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
  GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
  vocab.linefeed_id = ids[0];
  }
@@ -3850,11 +4155,13 @@ static void llm_load_vocab(
  // special tokens
  {
  const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
- { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
- { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
- { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
- { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
- { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+ { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
+ { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
+ { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
+ { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
+ { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+ { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
+ { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
  };
  for (const auto & it : special_token_types) {
  const std::string & key = kv(std::get<0>(it));
@@ -4046,12 +4353,14 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

  // special tokens
- if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
- if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
- if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
- if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
- if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
- if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+ if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+ if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+ if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+ if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+ if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+ if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+ if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+ if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
  }

  // Returns false if cancelled by progress_callback
@@ -4075,6 +4384,7 @@ static bool llm_load_tensors(
4075
4384
 
4076
4385
  const int64_t n_layer = hparams.n_layer;
4077
4386
  const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
4387
+ bool use_mmap_buffer = true;
4078
4388
 
4079
4389
  // there is very little benefit to offloading the input layer, so always keep it on the CPU
4080
4390
  model.buft_input = llama_default_buffer_type_cpu(true);
@@ -4163,6 +4473,10 @@ static bool llm_load_tensors(
4163
4473
 
4164
4474
  // create one context per buffer type
4165
4475
  size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
4476
+
4477
+ // for moe merged tensors
4478
+ ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
4479
+
4166
4480
  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
4167
4481
  for (auto & it : buft_layer_count) {
4168
4482
  struct ggml_init_params params = {
@@ -4189,6 +4503,11 @@ static bool llm_load_tensors(
4189
4503
  const int64_t n_vocab = hparams.n_vocab;
4190
4504
  const int64_t n_vocab_type = hparams.n_vocab_type;
4191
4505
  const int64_t n_ff = hparams.n_ff;
4506
+ const int64_t n_expert = hparams.n_expert;
4507
+
4508
+ if (n_expert > 0 && hparams.n_expert_used == 0) {
4509
+ throw std::runtime_error("model has expert layers but no expert layers are used");
4510
+ }
4192
4511
 
4193
4512
  GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
4194
4513
 
@@ -4243,26 +4562,113 @@ static bool llm_load_tensors(
4243
4562
 
4244
4563
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4245
4564
 
4246
- layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
4247
-
4248
- if (layer.ffn_gate_inp == nullptr) {
4249
- GGML_ASSERT(hparams.n_expert == 0);
4250
- GGML_ASSERT(hparams.n_expert_used == 0);
4251
-
4565
+ if (n_expert == 0) {
4252
4566
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
4253
4567
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4254
4568
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4255
4569
  } else {
4256
- GGML_ASSERT(hparams.n_expert > 0);
4257
- GGML_ASSERT(hparams.n_expert_used > 0);
4258
-
4259
- // MoE branch
4260
- for (uint32_t x = 0; x < hparams.n_expert; ++x) {
4261
- layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff});
4262
- layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd});
4263
- layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff});
4570
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
4571
+
4572
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
4573
+ if (layer.ffn_gate_exps) {
4574
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
4575
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
4576
+ } else {
4577
+ // merge the split expert tensors into a single 3D tensor for compatibility with older models
4578
+ // requires disabling mmap
4579
+ use_mmap_buffer = false;
4580
+
4581
+ ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
4582
+ ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
4583
+ ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
4584
+
4585
+ layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
4586
+ layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
4587
+ layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
4588
+
4589
+ ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
4590
+ ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
4591
+ ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
4592
+
4593
+ for (uint32_t x = 0; x < n_expert; ++x) {
4594
+ // the individual experts are loaded into a view of the merged tensor
4595
+ ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
4596
+ ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
4597
+ ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
4598
+ }
4599
+ }
4600
+ }
4601
+ }
4602
+ } break;
4603
+ case LLM_ARCH_GROK:
4604
+ {
4605
+ if (n_expert == 0) {
4606
+ throw std::runtime_error("Grok model cannot have zero experts");
4607
+ }
4608
+
4609
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4610
+
4611
+ // output
4612
+ {
4613
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4614
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
4615
+ // if output is NULL, init from the input tok embed
4616
+ if (model.output == NULL) {
4617
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4618
+ ml.n_created--; // artificial tensor
4619
+ ml.size_data += ggml_nbytes(model.output);
4620
+ }
4621
+ }
4622
+
4623
+ for (int i = 0; i < n_layer; ++i) {
4624
+ ggml_context * ctx_layer = ctx_for_layer(i);
4625
+ ggml_context * ctx_split = ctx_for_layer_split(i);
4626
+
4627
+ auto & layer = model.layers[i];
4628
+
4629
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4630
+
4631
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
4632
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
4633
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
4634
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
4635
+
4636
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
4637
+
4638
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4639
+
4640
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
4641
+
4642
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
4643
+ if (layer.ffn_gate_exps) {
4644
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
4645
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
4646
+ } else {
4647
+ // merge the split expert tensors into a single 3D tensor for compatibility with older models
4648
+ // requires disabling mmap
4649
+ use_mmap_buffer = false;
4650
+
4651
+ ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
4652
+ ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
4653
+ ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
4654
+
4655
+ layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
4656
+ layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
4657
+ layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
4658
+
4659
+ ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
4660
+ ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
4661
+ ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
4662
+
4663
+ for (uint32_t x = 0; x < n_expert; ++x) {
4664
+ // the individual experts are loaded into a view of the merged tensor
4665
+ ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
4666
+ ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
4667
+ ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
4264
4668
  }
4265
4669
  }
4670
+
4671
+ layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
4266
4672
  }
4267
4673
  } break;
4268
4674
  case LLM_ARCH_BAICHUAN:
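The MoE loading path in the hunk above merges the legacy per-expert 2D tensors (FFN_*_EXP.i.x) into one 3D tensor per projection and then loads each expert into a view placed at byte offset nb[2]*x, which is what ml.create_tensor_as_view() receives. The stride arithmetic behind those views is sketched below in plain C++; the toy dimensions and the float element type are illustrative assumptions, not values taken from any model.

// Standalone sketch (not part of the diff): the stride arithmetic behind loading
// per-expert 2D tensors into views of a merged [n_embd, n_ff, n_expert] tensor.
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd   = 8;   // assumed toy sizes
    const int64_t n_ff     = 4;
    const int64_t n_expert = 3;
    const size_t  elt_size = sizeof(float); // a non-quantized type, for simplicity

    // contiguous strides as ggml lays them out:
    // nb0 per element, nb1 per row, nb2 per whole expert slice
    size_t nb0 = elt_size;
    size_t nb1 = nb0 * (size_t) n_embd;  // one column of the 2D expert matrix
    size_t nb2 = nb1 * (size_t) n_ff;    // one whole [n_embd, n_ff] expert matrix

    // each split expert tensor is a view starting at nb2 * x,
    // mirroring the layer.ffn_*_exps->nb[2]*x offsets passed above
    for (int64_t x = 0; x < n_expert; ++x) {
        printf("expert %lld -> byte offset %zu, slice size %zu bytes\n",
               (long long) x, nb2 * (size_t) x, nb2);
    }
    return 0;
}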
@@ -4319,10 +4725,8 @@ static bool llm_load_tensors(
4319
4725
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4320
4726
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
4321
4727
 
4322
- if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
4323
- layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
4324
- layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
4325
- }
4728
+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
4729
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);
4326
4730
 
4327
4731
  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
4328
4732
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
@@ -4502,6 +4906,7 @@ static bool llm_load_tensors(
4502
4906
  case LLM_ARCH_MPT:
4503
4907
  {
4504
4908
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4909
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);
4505
4910
 
4506
4911
  // output
4507
4912
  {
@@ -4540,6 +4945,12 @@ static bool llm_load_tensors(
4540
4945
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4541
4946
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
4542
4947
 
4948
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
4949
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
4950
+
4951
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
4952
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
4953
+
4543
4954
  // AWQ ScaleActivation layer
4544
4955
  layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
4545
4956
  }
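Both the Falcon hunk (attn_norm_2 now created with a trailing false) and the MPT hunk above (pos_embd and the Q/K norm tensors) rely on the loader's optional-tensor convention: when the last argument of create_tensor() is false, a missing tensor comes back as nullptr and the graph builder branches on it later (for example the `if (model.pos_embd)` check further down). The toy loader below illustrates that lookup-or-null pattern; it is a hypothetical sketch, not the llama.cpp implementation.

// Hypothetical sketch of the optional-tensor pattern: required tensors throw,
// optional ones return nullptr so callers can branch on their presence.
#include <map>
#include <stdexcept>
#include <string>

struct toy_tensor { std::string name; };

struct toy_loader {
    std::map<std::string, toy_tensor> tensors;

    toy_tensor * create_tensor(const std::string & name, bool required = true) {
        auto it = tensors.find(name);
        if (it == tensors.end()) {
            if (required) {
                throw std::runtime_error("missing tensor: " + name);
            }
            return nullptr; // optional tensor: caller checks for nullptr
        }
        return &it->second;
    }
};

int main() {
    toy_loader ml;
    ml.tensors["token_embd.weight"] = {"token_embd.weight"};
    toy_tensor * pos = ml.create_tensor("position_embd.weight", /*required=*/false);
    return pos == nullptr ? 0 : 1; // absent optional tensor -> nullptr, no exception
}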
@@ -4986,6 +5397,28 @@ static bool llm_load_tensors(
4986
5397
  layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
4987
5398
  }
4988
5399
  } break;
5400
+ case LLM_ARCH_XVERSE:
5401
+ {
5402
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5403
+ {
5404
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5405
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
5406
+ }
5407
+ for (int i = 0; i < n_layer; ++i) {
5408
+ ggml_context * ctx_layer = ctx_for_layer(i);
5409
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5410
+ auto & layer = model.layers[i];
5411
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5412
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5413
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5414
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5415
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5416
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
5417
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5418
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
5419
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5420
+ }
5421
+ } break;
4989
5422
  case LLM_ARCH_COMMAND_R:
4990
5423
  {
4991
5424
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5007,6 +5440,11 @@ static bool llm_load_tensors(
5007
5440
 
5008
5441
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5009
5442
 
5443
+ if (n_layer >= 64) {
5444
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head});
5445
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv});
5446
+ }
5447
+
5010
5448
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5011
5449
  layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5012
5450
  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
@@ -5024,56 +5462,97 @@ static bool llm_load_tensors(
5024
5462
 
5025
5463
  ml.done_getting_tensors();
5026
5464
 
5027
- ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr);
5465
+ ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
5466
+ model.mappings.reserve(ml.mappings.size());
5028
5467
 
5029
5468
  // create the backend buffers
5030
- std::vector<std::pair<ggml_context *, ggml_backend_buffer_t>> ctx_bufs;
5469
+ std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
5470
+ ctx_bufs.reserve(ctx_map.size());
5471
+
5472
+ // Ensure we have enough capacity for the maximum number of backend buffers we may create
5473
+ size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
5474
+ model.bufs.reserve(n_max_backend_buffer);
5031
5475
 
5032
5476
  for (auto & it : ctx_map) {
5033
5477
  ggml_backend_buffer_type_t buft = it.first;
5034
- ggml_context * ctx = it.second;
5035
- ggml_backend_buffer_t buf = nullptr;
5478
+ ggml_context * ctx = it.second;
5479
+
5480
+ llama_buf_map bufs;
5481
+ bufs.reserve(n_max_backend_buffer);
5036
5482
 
5037
5483
  // only the mmap region containing the tensors in the model is mapped to the backend buffer
5038
5484
  // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
5039
5485
  // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
5040
- if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
5041
- size_t first, last;
5042
- ml.get_mapping_range(&first, &last, ctx);
5043
- buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
5044
- #ifdef GGML_USE_CUBLAS
5045
- if (n_layer >= n_gpu_layers) {
5046
- ggml_backend_cuda_register_host_buffer(
5486
+ if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
5487
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
5488
+ void * addr = nullptr;
5489
+ size_t first, last;
5490
+ ml.get_mapping_range(&first, &last, &addr, idx, ctx);
5491
+ if (first >= last) {
5492
+ continue;
5493
+ }
5494
+ ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
5495
+ if (buf == nullptr) {
5496
+ throw std::runtime_error("unable to allocate backend CPU buffer");
5497
+ }
5498
+ model.bufs.push_back(buf);
5499
+ bufs.emplace(idx, buf);
5500
+ #ifdef GGML_USE_CUDA
5501
+ if (n_layer >= n_gpu_layers) {
5502
+ ggml_backend_cuda_register_host_buffer(
5047
5503
  ggml_backend_buffer_get_base(buf),
5048
5504
  ggml_backend_buffer_get_size(buf));
5049
- }
5505
+ }
5050
5506
  #endif
5507
+ }
5051
5508
  }
5052
5509
  #ifdef GGML_USE_METAL
5053
- else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
5054
- const size_t max_size = ggml_get_max_tensor_size(ctx);
5055
- size_t first, last;
5056
- ml.get_mapping_range(&first, &last, ctx);
5057
- buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size);
5510
+ else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
5511
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
5512
+ const size_t max_size = ggml_get_max_tensor_size(ctx);
5513
+ void * addr = nullptr;
5514
+ size_t first, last;
5515
+ ml.get_mapping_range(&first, &last, &addr, idx, ctx);
5516
+ if (first >= last) {
5517
+ continue;
5518
+ }
5519
+ ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
5520
+ if (buf == nullptr) {
5521
+ throw std::runtime_error("unable to allocate backend metal buffer");
5522
+ }
5523
+ model.bufs.push_back(buf);
5524
+ bufs.emplace(idx, buf);
5525
+ }
5058
5526
  }
5059
5527
  #endif
5060
5528
  else {
5061
- buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
5062
- if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
5529
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
5530
+ if (buf == nullptr) {
5531
+ throw std::runtime_error("unable to allocate backend buffer");
5532
+ }
5533
+ model.bufs.push_back(buf);
5534
+ if (use_mlock && ggml_backend_buffer_is_host(buf)) {
5063
5535
  model.mlock_bufs.emplace_back(new llama_mlock);
5064
5536
  auto & mlock_buf = model.mlock_bufs.back();
5065
5537
  mlock_buf->init (ggml_backend_buffer_get_base(buf));
5066
5538
  mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
5067
5539
  }
5540
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
5541
+ bufs.emplace(idx, buf);
5542
+ }
5068
5543
  }
5069
- if (buf == nullptr) {
5544
+
5545
+ if (bufs.empty()) {
5070
5546
  throw std::runtime_error("failed to allocate buffer");
5071
5547
  }
5072
- // indicate that this buffer contains weights
5073
- // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
5074
- ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
5075
- model.bufs.push_back(buf);
5076
- ctx_bufs.emplace_back(ctx, buf);
5548
+
5549
+ for (auto & buf : bufs) {
5550
+ // indicate that this buffer contains weights
5551
+ // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
5552
+ ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
5553
+ }
5554
+
5555
+ ctx_bufs.emplace_back(ctx, bufs);
5077
5556
  }
5078
5557
 
5079
5558
  if (llama_supports_gpu_offload()) {
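With split GGUF files the loader now keeps one mapping per file, and for the CPU and Metal paths above it creates a backend buffer only over the byte range [first, last) of each mapping that actually holds tensors of the current context, skipping files that contribute nothing. A plain C++ sketch of that range computation follows; the record type, file count and sizes are assumptions for illustration, not the llama.cpp API.

// Illustrative sketch: compute the per-file byte range that contains tensors,
// mirroring the per-file get_mapping_range() loop; empty ranges get no buffer.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

struct tensor_record {
    int    file_idx; // which split file the tensor data lives in
    size_t offs;     // byte offset of the tensor data within that file
    size_t size;     // byte size of the tensor data
};

int main() {
    const int n_files = 3; // assumed number of GGUF splits
    std::vector<tensor_record> tensors = {
        {0, 128, 4096}, {0, 8192, 2048}, {2, 64, 1024}, // file 1 holds nothing here
    };

    for (int idx = 0; idx < n_files; ++idx) {
        size_t first = SIZE_MAX, last = 0;
        for (const auto & t : tensors) {
            if (t.file_idx != idx) continue;
            first = std::min(first, t.offs);
            last  = std::max(last,  t.offs + t.size);
        }
        if (first >= last) {
            printf("file %d: no tensors, skip buffer\n", idx); // mirrors `if (first >= last) continue;`
            continue;
        }
        printf("file %d: map [%zu, %zu) -> one backend buffer\n", idx, first, last);
    }
    return 0;
}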
@@ -5105,13 +5584,17 @@ static bool llm_load_tensors(
5105
5584
  // load tensor data
5106
5585
  for (auto & it : ctx_bufs) {
5107
5586
  ggml_context * ctx = it.first;
5108
- ggml_backend_buffer_t buf = it.second;
5109
- if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) {
5587
+ auto & bufs = it.second;
5588
+ if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
5110
5589
  return false;
5111
5590
  }
5112
5591
  }
5113
5592
 
5114
- model.mapping = std::move(ml.mapping);
5593
+ if (use_mmap_buffer) {
5594
+ for (auto & mapping : ml.mappings) {
5595
+ model.mappings.emplace_back(std::move(mapping));
5596
+ }
5597
+ }
5115
5598
 
5116
5599
  // loading time will be recalculated after the first eval, so
5117
5600
  // we take page faults deferred by mmap() into consideration
@@ -5266,8 +5749,8 @@ static void llm_build_kv_store(
5266
5749
  GGML_ASSERT(kv.size == n_ctx);
5267
5750
 
5268
5751
  // compute the transposed [n_tokens, n_embd] V matrix
5269
- struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
5270
- //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
5752
+ assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
5753
+ struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
5271
5754
  cb(v_cur_t, "v_cur_t", il);
5272
5755
 
5273
5756
  struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
@@ -5451,6 +5934,20 @@ static struct ggml_tensor * llm_build_kqv(
5451
5934
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
5452
5935
  }
5453
5936
 
5937
+ if (model.arch == LLM_ARCH_GROK) {
5938
+ // need to do the following:
5939
+ // multiply by attn_output_multiplier of 0.08838834764831845
5940
+ // and then:
5941
+ // kq = 30 * tanh(kq / 30)
5942
+ // before the softmax below
5943
+
5944
+ // try from phi2
5945
+ //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
5946
+
5947
+ kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
5948
+ kq = ggml_scale(ctx, kq, 30);
5949
+ }
5950
+
5454
5951
  #if defined(GGML_USE_KOMPUTE)
5455
5952
  #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
5456
5953
  #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
@@ -5577,7 +6074,8 @@ struct llm_build_context {
5577
6074
  const float norm_rms_eps;
5578
6075
 
5579
6076
  const int32_t n_tokens;
5580
- const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx)
6077
+ const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
6078
+ const int32_t n_outputs;
5581
6079
  const int32_t kv_head; // index of where we store new KV data in the cache
5582
6080
  const int32_t n_orig_ctx;
5583
6081
 
@@ -5624,6 +6122,7 @@ struct llm_build_context {
5624
6122
  norm_rms_eps (hparams.f_norm_rms_eps),
5625
6123
  n_tokens (batch.n_tokens),
5626
6124
  n_kv (worst_case ? kv_self.size : kv_self.n),
6125
+ n_outputs (worst_case ? n_tokens : lctx.n_outputs),
5627
6126
  kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
5628
6127
  n_orig_ctx (cparams.n_yarn_orig_ctx),
5629
6128
  pooling_type (cparams.pooling_type),
@@ -5645,6 +6144,7 @@ struct llm_build_context {
5645
6144
  lctx.inp_tokens = nullptr;
5646
6145
  lctx.inp_embd = nullptr;
5647
6146
  lctx.inp_pos = nullptr;
6147
+ lctx.inp_out_ids = nullptr;
5648
6148
  lctx.inp_KQ_mask = nullptr;
5649
6149
  lctx.inp_KQ_pos = nullptr;
5650
6150
  lctx.inp_K_shift = nullptr;
@@ -5768,6 +6268,13 @@ struct llm_build_context {
5768
6268
  return lctx.inp_pos;
5769
6269
  }
5770
6270
 
6271
+ struct ggml_tensor * build_inp_out_ids() {
6272
+ lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
6273
+ cb(lctx.inp_out_ids, "inp_out_ids", -1);
6274
+ ggml_set_input(lctx.inp_out_ids);
6275
+ return lctx.inp_out_ids;
6276
+ }
6277
+
5771
6278
  struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
5772
6279
  if (causal) {
5773
6280
  lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
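build_inp_out_ids() added above feeds each graph the indices of the tokens whose outputs are actually needed; at the last layer the builders call ggml_get_rows(cur, inp_out_ids) so that the final norm and lm_head run only on those rows. A minimal stand-alone sketch of that row gather, with plain vectors standing in for ggml tensors and illustrative sizes:

// Minimal sketch of "skip computing output for unused tokens": keep only the
// rows named in inp_out_ids, analogous to ggml_get_rows(cur, inp_out_ids).
#include <cstdio>
#include <vector>

int main() {
    const int n_tokens = 4, n_embd = 3;
    std::vector<float> cur(n_tokens * n_embd);
    for (int i = 0; i < n_tokens * n_embd; ++i) cur[i] = (float) i;

    std::vector<int> out_ids = {3};            // e.g. only the last token's logits are needed
    std::vector<float> out(out_ids.size() * n_embd);

    for (size_t r = 0; r < out_ids.size(); ++r) {
        for (int c = 0; c < n_embd; ++c) {
            out[r * n_embd + c] = cur[out_ids[r] * n_embd + c]; // row gather
        }
    }

    printf("kept %zu of %d rows for the final norm and lm_head\n", out_ids.size(), n_tokens);
    return 0;
}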
@@ -5824,6 +6331,9 @@ struct llm_build_context {
5824
6331
  struct ggml_cgraph * build_llama() {
5825
6332
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5826
6333
 
6334
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
6335
+ int32_t n_tokens = this->n_tokens;
6336
+
5827
6337
  const int64_t n_embd_head = hparams.n_embd_head_v;
5828
6338
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5829
6339
  GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -5891,6 +6401,14 @@ struct llm_build_context {
5891
6401
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5892
6402
  }
5893
6403
 
6404
+ if (il == n_layer - 1) {
6405
+ // skip computing output for unused tokens
6406
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6407
+ n_tokens = n_outputs;
6408
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6409
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6410
+ }
6411
+
5894
6412
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
5895
6413
  cb(ffn_inp, "ffn_inp", il);
5896
6414
 
@@ -5943,19 +6461,19 @@ struct llm_build_context {
5943
6461
  for (int i = 0; i < n_expert_used; ++i) {
5944
6462
  ggml_tensor * cur_expert;
5945
6463
 
5946
- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
6464
+ ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
5947
6465
  cb(cur_up, "ffn_moe_up", il);
5948
6466
 
5949
- ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
6467
+ ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
5950
6468
  cb(cur_gate, "ffn_moe_gate", il);
5951
6469
 
5952
6470
  cur_gate = ggml_silu(ctx0, cur_gate);
5953
6471
  cb(cur_gate, "ffn_moe_silu", il);
5954
6472
 
5955
- cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
6473
+ cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
5956
6474
  cb(cur_expert, "ffn_moe_gate_par", il);
5957
6475
 
5958
- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
6476
+ cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
5959
6477
  cb(cur_expert, "ffn_moe_down", il);
5960
6478
 
5961
6479
  cur_expert = ggml_mul(ctx0, cur_expert,
@@ -6070,6 +6588,13 @@ struct llm_build_context {
6070
6588
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6071
6589
  }
6072
6590
 
6591
+ if (il == n_layer - 1) {
6592
+ // skip computing output for unused tokens
6593
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6594
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6595
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6596
+ }
6597
+
6073
6598
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6074
6599
  cb(ffn_inp, "ffn_inp", il);
6075
6600
 
@@ -6112,6 +6637,111 @@ struct llm_build_context {
6112
6637
  return gf;
6113
6638
  }
6114
6639
 
6640
+ struct ggml_cgraph * build_xverse() {
6641
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6642
+
6643
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6644
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6645
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6646
+
6647
+ struct ggml_tensor * cur;
6648
+ struct ggml_tensor * inpL;
6649
+
6650
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6651
+
6652
+ // inp_pos - contains the positions
6653
+ struct ggml_tensor * inp_pos = build_inp_pos();
6654
+
6655
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6656
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6657
+
6658
+ // positions of the tokens in the KV cache
6659
+ struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
6660
+
6661
+ for (int il = 0; il < n_layer; ++il) {
6662
+ struct ggml_tensor * inpSA = inpL;
6663
+
6664
+ cur = llm_build_norm(ctx0, inpL, hparams,
6665
+ model.layers[il].attn_norm, NULL,
6666
+ LLM_NORM_RMS, cb, il);
6667
+ cb(cur, "attn_norm", il);
6668
+
6669
+ // self-attention
6670
+ {
6671
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6672
+ cb(Qcur, "Qcur", il);
6673
+
6674
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
6675
+ cb(Kcur, "Kcur", il);
6676
+
6677
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
6678
+ cb(Vcur, "Vcur", il);
6679
+
6680
+ Qcur = ggml_rope_custom(
6681
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6682
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6683
+ ext_factor, attn_factor, beta_fast, beta_slow
6684
+ );
6685
+ cb(Qcur, "Qcur", il);
6686
+
6687
+ Kcur = ggml_rope_custom(
6688
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6689
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6690
+ ext_factor, attn_factor, beta_fast, beta_slow
6691
+ );
6692
+ cb(Kcur, "Kcur", il);
6693
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6694
+ model.layers[il].wo, NULL,
6695
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6696
+ }
6697
+
6698
+ if (il == n_layer - 1) {
6699
+ // skip computing output for unused tokens
6700
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6701
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6702
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6703
+ }
6704
+
6705
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6706
+ cb(ffn_inp, "ffn_inp", il);
6707
+
6708
+ // feed-forward network
6709
+ {
6710
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6711
+ model.layers[il].ffn_norm, NULL,
6712
+ LLM_NORM_RMS, cb, il);
6713
+ cb(cur, "ffn_norm", il);
6714
+
6715
+ cur = llm_build_ffn(ctx0, cur,
6716
+ model.layers[il].ffn_up, NULL,
6717
+ model.layers[il].ffn_gate, NULL,
6718
+ model.layers[il].ffn_down, NULL,
6719
+ NULL,
6720
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
6721
+ cb(cur, "ffn_out", il);
6722
+ }
6723
+
6724
+ cur = ggml_add(ctx0, cur, ffn_inp);
6725
+ cb(cur, "l_out", il);
6726
+
6727
+ // input for next layer
6728
+ inpL = cur;
6729
+ }
6730
+
6731
+ cur = inpL;
6732
+
6733
+ cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
6734
+ cb(cur, "result_norm", -1);
6735
+
6736
+ // lm_head
6737
+ cur = ggml_mul_mat(ctx0, model.output, cur);
6738
+ cb(cur, "result_output", -1);
6739
+
6740
+ ggml_build_forward_expand(gf, cur);
6741
+
6742
+ return gf;
6743
+ }
6744
+
6115
6745
  struct ggml_cgraph * build_falcon() {
6116
6746
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6117
6747
 
@@ -6185,6 +6815,14 @@ struct llm_build_context {
6185
6815
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6186
6816
  }
6187
6817
 
6818
+ if (il == n_layer - 1) {
6819
+ // skip computing output for unused tokens
6820
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6821
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6822
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6823
+ attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
6824
+ }
6825
+
6188
6826
  struct ggml_tensor * ffn_inp = cur;
6189
6827
 
6190
6828
  // feed forward
@@ -6225,144 +6863,359 @@ struct llm_build_context {
6225
6863
  return gf;
6226
6864
  }
6227
6865
 
6228
- struct ggml_cgraph * build_starcoder() {
6866
+ struct ggml_cgraph * build_grok() {
6229
6867
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6230
6868
 
6869
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
6870
+ int32_t n_tokens = this->n_tokens;
6871
+
6231
6872
  const int64_t n_embd_head = hparams.n_embd_head_v;
6232
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6233
6873
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6874
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6234
6875
 
6235
6876
  struct ggml_tensor * cur;
6236
6877
  struct ggml_tensor * inpL;
6237
6878
 
6238
6879
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6239
6880
 
6881
+ // multiply by embedding_multiplier_scale of 78.38367176906169
6882
+ inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
6883
+
6240
6884
  // inp_pos - contains the positions
6241
6885
  struct ggml_tensor * inp_pos = build_inp_pos();
6242
6886
 
6243
6887
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6244
6888
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6245
6889
 
6246
- struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
6247
- cb(pos, "pos_embd", -1);
6248
-
6249
- inpL = ggml_add(ctx0, inpL, pos);
6250
- cb(inpL, "inpL", -1);
6251
-
6252
6890
  for (int il = 0; il < n_layer; ++il) {
6891
+ struct ggml_tensor * inpSA = inpL;
6892
+
6893
+ // norm
6253
6894
  cur = llm_build_norm(ctx0, inpL, hparams,
6254
- model.layers[il].attn_norm,
6255
- model.layers[il].attn_norm_b,
6256
- LLM_NORM, cb, il);
6895
+ model.layers[il].attn_norm, NULL,
6896
+ LLM_NORM_RMS, cb, il);
6257
6897
  cb(cur, "attn_norm", il);
6258
6898
 
6899
+
6259
6900
  // self-attention
6260
6901
  {
6261
- cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
6262
- cb(cur, "wqkv", il);
6902
+ // compute Q and K and RoPE them
6903
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6904
+ cb(Qcur, "Qcur", il);
6905
+ if (model.layers[il].bq) {
6906
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
6907
+ cb(Qcur, "Qcur", il);
6908
+ }
6263
6909
 
6264
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6265
- cb(cur, "bqkv", il);
6910
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
6911
+ cb(Kcur, "Kcur", il);
6912
+ if (model.layers[il].bk) {
6913
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
6914
+ cb(Kcur, "Kcur", il);
6915
+ }
6266
6916
 
6267
- struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6268
- struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6269
- struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6917
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
6918
+ cb(Vcur, "Vcur", il);
6919
+ if (model.layers[il].bv) {
6920
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
6921
+ cb(Vcur, "Vcur", il);
6922
+ }
6270
6923
 
6924
+ Qcur = ggml_rope_custom(
6925
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6926
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6927
+ ext_factor, attn_factor, beta_fast, beta_slow
6928
+ );
6271
6929
  cb(Qcur, "Qcur", il);
6272
- cb(Kcur, "Kcur", il);
6273
- cb(Vcur, "Vcur", il);
6274
6930
 
6275
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6931
+ Kcur = ggml_rope_custom(
6932
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6933
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6934
+ ext_factor, attn_factor, beta_fast, beta_slow
6935
+ );
6936
+ cb(Kcur, "Kcur", il);
6276
6937
 
6277
6938
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6278
6939
  model.layers[il].wo, model.layers[il].bo,
6279
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6940
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
6280
6941
  }
6281
6942
 
6282
- // add the input
6283
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
6284
- cb(ffn_inp, "ffn_inp", il);
6285
-
6286
- // FF
6287
- {
6288
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
6289
- model.layers[il].ffn_norm,
6290
- model.layers[il].ffn_norm_b,
6291
- LLM_NORM, cb, il);
6292
- cb(cur, "ffn_norm", il);
6943
+ if (il == n_layer - 1) {
6944
+ // skip computing output for unused tokens
6945
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6946
+ n_tokens = n_outputs;
6947
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6948
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6949
+ }
6293
6950
 
6294
- cur = llm_build_ffn(ctx0, cur,
6295
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
6296
- NULL, NULL,
6297
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
6298
- NULL,
6299
- LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
6300
- cb(cur, "ffn_out", il);
6951
+ // Grok
6952
+ // if attn_out_norm is present then apply it before adding the input
6953
+ if (model.layers[il].attn_out_norm) {
6954
+ cur = llm_build_norm(ctx0, cur, hparams,
6955
+ model.layers[il].attn_out_norm, NULL,
6956
+ LLM_NORM_RMS, cb, il);
6957
+ cb(cur, "attn_out_norm", il);
6301
6958
  }
6302
6959
 
6303
- inpL = ggml_add(ctx0, cur, ffn_inp);
6304
- cb(inpL, "l_out", il);
6305
- }
6960
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6961
+ cb(ffn_inp, "ffn_inp", il);
6306
6962
 
6307
- cur = llm_build_norm(ctx0, inpL, hparams,
6308
- model.output_norm,
6309
- model.output_norm_b,
6310
- LLM_NORM, cb, -1);
6311
- cb(cur, "result_norm", -1);
6963
+ // feed-forward network
6964
+ // MoE branch
6965
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6966
+ model.layers[il].ffn_norm, NULL,
6967
+ LLM_NORM_RMS, cb, il);
6968
+ cb(cur, "ffn_norm", il);
6312
6969
 
6313
- cur = ggml_mul_mat(ctx0, model.output, cur);
6314
- cb(cur, "result_output", -1);
6970
+ ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
6971
+ cb(logits, "ffn_moe_logits", il);
6315
6972
 
6316
- ggml_build_forward_expand(gf, cur);
6973
+ ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
6974
+ cb(probs, "ffn_moe_probs", il);
6317
6975
 
6318
- return gf;
6319
- }
6976
+ // select experts
6977
+ ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
6978
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
6320
6979
 
6321
- struct ggml_cgraph * build_persimmon() {
6322
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6980
+ ggml_tensor * weights = ggml_get_rows(ctx0,
6981
+ ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
6982
+ cb(weights, "ffn_moe_weights", il);
6323
6983
 
6324
- const int64_t n_embd_head = hparams.n_embd_head_v;
6325
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6326
- GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
6984
+ weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
6327
6985
 
6328
- struct ggml_tensor * cur;
6329
- struct ggml_tensor * inpL;
6986
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
6987
+ cb(weights_sum, "ffn_moe_weights_sum", il);
6330
6988
 
6331
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6989
+ weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
6990
+ cb(weights, "ffn_moe_weights_norm", il);
6332
6991
 
6333
- // inp_pos - contains the positions
6334
- struct ggml_tensor * inp_pos = build_inp_pos();
6992
+ // compute expert outputs
6993
+ ggml_tensor * moe_out = nullptr;
6335
6994
 
6336
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6337
- struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6995
+ for (int i = 0; i < n_expert_used; ++i) {
6996
+ ggml_tensor * cur_expert;
6338
6997
 
6339
- for (int il = 0; il < n_layer; ++il) {
6340
- struct ggml_tensor * residual = inpL;
6998
+ ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
6999
+ cb(cur_up, "ffn_moe_up", il);
6341
7000
 
6342
- cur = llm_build_norm(ctx0, inpL, hparams,
6343
- model.layers[il].attn_norm,
6344
- model.layers[il].attn_norm_b,
6345
- LLM_NORM, cb, il);
6346
- cb(cur, "attn_norm", il);
7001
+ ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
7002
+ cb(cur_gate, "ffn_moe_gate", il);
6347
7003
 
6348
- // self attention
6349
- {
6350
- cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
6351
- cb(cur, "wqkv", il);
7004
+ //GeLU
7005
+ cur_gate = ggml_gelu(ctx0, cur_gate);
7006
+ cb(cur_gate, "ffn_moe_gelu", il);
6352
7007
 
6353
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6354
- cb(cur, "bqkv", il);
7008
+ cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
7009
+ cb(cur_expert, "ffn_moe_gate_par", il);
6355
7010
 
6356
- // split qkv
6357
- GGML_ASSERT(n_head_kv == n_head);
7011
+ cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
7012
+ cb(cur_expert, "ffn_moe_down", il);
6358
7013
 
6359
- struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
6360
- cb(tmpqkv, "tmpqkv", il);
7014
+ cur_expert = ggml_mul(ctx0, cur_expert,
7015
+ ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
7016
+ cb(cur_expert, "ffn_moe_weighted", il);
6361
7017
 
6362
- struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
6363
- cb(tmpqkv_perm, "tmpqkv", il);
7018
+ if (i == 0) {
7019
+ moe_out = cur_expert;
7020
+ } else {
7021
+ moe_out = ggml_add(ctx0, moe_out, cur_expert);
7022
+ cb(moe_out, "ffn_moe_out", il);
7023
+ }
7024
+ }
6364
7025
 
6365
- struct ggml_tensor * tmpq = ggml_view_3d(
7026
+ cur = moe_out;
7027
+
7028
+ // Grok
7029
+ // if layer_out_norm is present then apply it before adding the input
7030
+ // Idea: maybe ffn_out_norm is a better name
7031
+ if (model.layers[il].layer_out_norm) {
7032
+ cur = llm_build_norm(ctx0, cur, hparams,
7033
+ model.layers[il].layer_out_norm, NULL,
7034
+ LLM_NORM_RMS, cb, il);
7035
+ cb(cur, "layer_out_norm", il);
7036
+ }
7037
+
7038
+
7039
+ cur = ggml_add(ctx0, cur, ffn_inp);
7040
+ cb(cur, "ffn_out", il);
7041
+
7042
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
7043
+ if (layer_dir != nullptr) {
7044
+ cur = ggml_add(ctx0, cur, layer_dir);
7045
+ }
7046
+ cb(cur, "l_out", il);
7047
+
7048
+ // input for next layer
7049
+ inpL = cur;
7050
+ }
7051
+
7052
+ cur = inpL;
7053
+
7054
+ cur = llm_build_norm(ctx0, cur, hparams,
7055
+ model.output_norm, NULL,
7056
+ LLM_NORM_RMS, cb, -1);
7057
+ cb(cur, "result_norm", -1);
7058
+
7059
+ // lm_head
7060
+ cur = ggml_mul_mat(ctx0, model.output, cur);
7061
+
7062
+ // Grok
7063
+ // multiply logits by output_multiplier_scale of 0.5773502691896257
7064
+
7065
+ cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
7066
+
7067
+ cb(cur, "result_output", -1);
7068
+
7069
+ ggml_build_forward_expand(gf, cur);
7070
+
7071
+ return gf;
7072
+ }
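The MoE branch of build_grok() above (and the updated llama MoE path) routes each token through n_expert_used of the n_expert experts: softmax over the gate logits, top-k selection, renormalization of the selected weights, and a weighted sum of the chosen experts' outputs. The scalar sketch below mirrors those steps under assumed sizes; the per-expert output is a dummy value rather than a real ffn_down(act(gate) * up) computation.

// Scalar sketch of the per-token MoE routing (the ffn_moe_* tensors above):
// softmax the gate logits, keep the top n_expert_used, renormalize the kept
// weights, and mix the selected experts' outputs.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    const int n_expert = 8, n_expert_used = 2;  // assumed counts
    std::vector<float> logits = {0.1f, 2.0f, -1.0f, 0.5f, 1.5f, -0.3f, 0.0f, 0.7f};

    // softmax -> routing probabilities (ffn_moe_probs)
    float mx = *std::max_element(logits.begin(), logits.end());
    std::vector<float> probs(n_expert);
    float sum = 0.0f;
    for (int e = 0; e < n_expert; ++e) { probs[e] = std::exp(logits[e] - mx); sum += probs[e]; }
    for (float & p : probs) p /= sum;

    // top-k experts by probability (ggml_top_k in the graph)
    std::vector<int> idx(n_expert);
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; });

    // renormalize the selected weights (ffn_moe_weights_norm)
    float wsum = 0.0f;
    for (int i = 0; i < n_expert_used; ++i) wsum += probs[idx[i]];

    float moe_out = 0.0f;
    for (int i = 0; i < n_expert_used; ++i) {
        float expert_out = 1.0f + idx[i];                // stand-in for the expert FFN output
        moe_out += (probs[idx[i]] / wsum) * expert_out;  // ffn_moe_weighted, then summed
    }
    printf("selected experts %d and %d, mixed output %f\n", idx[0], idx[1], moe_out);
    return 0;
}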
7073
+
7074
+ struct ggml_cgraph * build_starcoder() {
7075
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7076
+
7077
+ const int64_t n_embd_head = hparams.n_embd_head_v;
7078
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
7079
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7080
+
7081
+ struct ggml_tensor * cur;
7082
+ struct ggml_tensor * inpL;
7083
+
7084
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
7085
+
7086
+ // inp_pos - contains the positions
7087
+ struct ggml_tensor * inp_pos = build_inp_pos();
7088
+
7089
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7090
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7091
+
7092
+ struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
7093
+ cb(pos, "pos_embd", -1);
7094
+
7095
+ inpL = ggml_add(ctx0, inpL, pos);
7096
+ cb(inpL, "inpL", -1);
7097
+
7098
+ for (int il = 0; il < n_layer; ++il) {
7099
+ cur = llm_build_norm(ctx0, inpL, hparams,
7100
+ model.layers[il].attn_norm,
7101
+ model.layers[il].attn_norm_b,
7102
+ LLM_NORM, cb, il);
7103
+ cb(cur, "attn_norm", il);
7104
+
7105
+ // self-attention
7106
+ {
7107
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
7108
+ cb(cur, "wqkv", il);
7109
+
7110
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
7111
+ cb(cur, "bqkv", il);
7112
+
7113
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
7114
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
7115
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
7116
+
7117
+ cb(Qcur, "Qcur", il);
7118
+ cb(Kcur, "Kcur", il);
7119
+ cb(Vcur, "Vcur", il);
7120
+
7121
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7122
+
7123
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7124
+ model.layers[il].wo, model.layers[il].bo,
7125
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7126
+ }
7127
+
7128
+ if (il == n_layer - 1) {
7129
+ // skip computing output for unused tokens
7130
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7131
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7132
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7133
+ }
7134
+
7135
+ // add the input
7136
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
7137
+ cb(ffn_inp, "ffn_inp", il);
7138
+
7139
+ // FF
7140
+ {
7141
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
7142
+ model.layers[il].ffn_norm,
7143
+ model.layers[il].ffn_norm_b,
7144
+ LLM_NORM, cb, il);
7145
+ cb(cur, "ffn_norm", il);
7146
+
7147
+ cur = llm_build_ffn(ctx0, cur,
7148
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
7149
+ NULL, NULL,
7150
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
7151
+ NULL,
7152
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
7153
+ cb(cur, "ffn_out", il);
7154
+ }
7155
+
7156
+ inpL = ggml_add(ctx0, cur, ffn_inp);
7157
+ cb(inpL, "l_out", il);
7158
+ }
7159
+
7160
+ cur = llm_build_norm(ctx0, inpL, hparams,
7161
+ model.output_norm,
7162
+ model.output_norm_b,
7163
+ LLM_NORM, cb, -1);
7164
+ cb(cur, "result_norm", -1);
7165
+
7166
+ cur = ggml_mul_mat(ctx0, model.output, cur);
7167
+ cb(cur, "result_output", -1);
7168
+
7169
+ ggml_build_forward_expand(gf, cur);
7170
+
7171
+ return gf;
7172
+ }
7173
+
7174
+ struct ggml_cgraph * build_persimmon() {
7175
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7176
+
7177
+ const int64_t n_embd_head = hparams.n_embd_head_v;
7178
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7179
+ GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
7180
+
7181
+ struct ggml_tensor * cur;
7182
+ struct ggml_tensor * inpL;
7183
+
7184
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
7185
+
7186
+ // inp_pos - contains the positions
7187
+ struct ggml_tensor * inp_pos = build_inp_pos();
7188
+
7189
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7190
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7191
+
7192
+ for (int il = 0; il < n_layer; ++il) {
7193
+ struct ggml_tensor * residual = inpL;
7194
+
7195
+ cur = llm_build_norm(ctx0, inpL, hparams,
7196
+ model.layers[il].attn_norm,
7197
+ model.layers[il].attn_norm_b,
7198
+ LLM_NORM, cb, il);
7199
+ cb(cur, "attn_norm", il);
7200
+
7201
+ // self attention
7202
+ {
7203
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
7204
+ cb(cur, "wqkv", il);
7205
+
7206
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
7207
+ cb(cur, "bqkv", il);
7208
+
7209
+ // split qkv
7210
+ GGML_ASSERT(n_head_kv == n_head);
7211
+
7212
+ struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
7213
+ cb(tmpqkv, "tmpqkv", il);
7214
+
7215
+ struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
7216
+ cb(tmpqkv_perm, "tmpqkv", il);
7217
+
7218
+ struct ggml_tensor * tmpq = ggml_view_3d(
6366
7219
  ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
6367
7220
  ggml_element_size(tmpqkv_perm) * n_embd_head,
6368
7221
  ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
@@ -6476,6 +7329,13 @@ struct llm_build_context {
6476
7329
  Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6477
7330
  }
6478
7331
 
7332
+ if (il == n_layer - 1) {
7333
+ // skip computing output for unused tokens
7334
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7335
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7336
+ residual = ggml_get_rows(ctx0, residual, inp_out_ids);
7337
+ }
7338
+
6479
7339
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
6480
7340
  cb(ffn_inp, "ffn_inp", il);
6481
7341
 
@@ -6565,6 +7425,13 @@ struct llm_build_context {
6565
7425
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6566
7426
  }
6567
7427
 
7428
+ if (il == n_layer - 1) {
7429
+ // skip computing output for unused tokens
7430
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7431
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7432
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7433
+ }
7434
+
6568
7435
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6569
7436
  cb(ffn_inp, "ffn_inp", il);
6570
7437
 
@@ -6722,6 +7589,13 @@ struct llm_build_context {
6722
7589
  }
6723
7590
  cb(cur, "kqv_out", il);
6724
7591
 
7592
+ if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
7593
+ // skip computing output for unused tokens
7594
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7595
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7596
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7597
+ }
7598
+
6725
7599
  // re-add the layer input
6726
7600
  cur = ggml_add(ctx0, cur, inpL);
6727
7601
 
@@ -6844,6 +7718,13 @@ struct llm_build_context {
6844
7718
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6845
7719
  }
6846
7720
 
7721
+ if (il == n_layer - 1) {
7722
+ // skip computing output for unused tokens
7723
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7724
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7725
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7726
+ }
7727
+
6847
7728
  // Add the input
6848
7729
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
6849
7730
  cb(ffn_inp, "ffn_inp", il);
@@ -6891,6 +7772,7 @@ struct llm_build_context {
6891
7772
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6892
7773
 
6893
7774
  struct ggml_tensor * cur;
7775
+ struct ggml_tensor * pos;
6894
7776
  struct ggml_tensor * inpL;
6895
7777
 
6896
7778
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -6901,6 +7783,16 @@ struct llm_build_context {
6901
7783
  // positions of the tokens in the KV cache
6902
7784
  struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
6903
7785
 
7786
+ if (model.pos_embd) {
7787
+ // inp_pos - contains the positions
7788
+ struct ggml_tensor * inp_pos = build_inp_pos();
7789
+ pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
7790
+ cb(pos, "pos_embd", -1);
7791
+
7792
+ inpL = ggml_add(ctx0, inpL, pos);
7793
+ cb(inpL, "inpL", -1);
7794
+ }
7795
+
6904
7796
  for (int il = 0; il < n_layer; ++il) {
6905
7797
  struct ggml_tensor * attn_norm;
6906
7798
 
@@ -6935,11 +7827,39 @@ struct llm_build_context {
6935
7827
  cb(Kcur, "Kcur", il);
6936
7828
  cb(Vcur, "Vcur", il);
6937
7829
 
6938
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7830
+ // Q/K Layernorm
7831
+ if (model.layers[il].attn_q_norm) {
7832
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
7833
+ model.layers[il].attn_q_norm,
7834
+ model.layers[il].attn_q_norm_b,
7835
+ LLM_NORM, cb, il);
7836
+ cb(Qcur, "Qcur", il);
6939
7837
 
6940
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7838
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
7839
+ model.layers[il].attn_k_norm,
7840
+ model.layers[il].attn_k_norm_b,
7841
+ LLM_NORM, cb, il);
7842
+ cb(Kcur, "Kcur", il);
7843
+
7844
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7845
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7846
+
7847
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6941
7848
  model.layers[il].wo, model.layers[il].bo,
6942
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7849
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7850
+ } else {
7851
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7852
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7853
+ model.layers[il].wo, model.layers[il].bo,
7854
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7855
+ }
7856
+ }
7857
+
7858
+ if (il == n_layer - 1) {
7859
+ // skip computing output for unused tokens
7860
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7861
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7862
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6943
7863
  }
6944
7864
 
6945
7865
  // Add the input
@@ -7055,6 +7975,13 @@ struct llm_build_context {
7055
7975
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7056
7976
  }
7057
7977
 
7978
+ if (il == n_layer - 1) {
7979
+ // skip computing output for unused tokens
7980
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7981
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7982
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7983
+ }
7984
+
7058
7985
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7059
7986
  cb(ffn_inp, "ffn_inp", il);
7060
7987
 
@@ -7161,6 +8088,13 @@ struct llm_build_context {
7161
8088
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7162
8089
  }
7163
8090
 
8091
+ if (il == n_layer - 1) {
8092
+ // skip computing output for unused tokens
8093
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8094
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8095
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8096
+ }
8097
+
7164
8098
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7165
8099
  cb(ffn_inp, "ffn_inp", il);
7166
8100
 
@@ -7273,6 +8207,13 @@ struct llm_build_context {
7273
8207
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7274
8208
  }
7275
8209
 
8210
+ if (il == n_layer - 1) {
8211
+ // skip computing output for unused tokens
8212
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8213
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8214
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8215
+ }
8216
+
7276
8217
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7277
8218
  cb(ffn_inp, "ffn_inp", il);
7278
8219
 
@@ -7391,6 +8332,14 @@ struct llm_build_context {
7391
8332
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7392
8333
  }
7393
8334
 
8335
+ if (il == n_layer - 1) {
8336
+ // skip computing output for unused tokens
8337
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8338
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8339
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8340
+ attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
8341
+ }
8342
+
7394
8343
  // FF
7395
8344
  {
7396
8345
  ffn_output = llm_build_ffn(ctx0, attn_norm_output,
@@ -7488,6 +8437,14 @@ struct llm_build_context {
7488
8437
 
7489
8438
  cur = attention_norm;
7490
8439
 
8440
+ if (il == n_layer - 1) {
8441
+ // skip computing output for unused tokens
8442
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8443
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8444
+ sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
8445
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8446
+ }
8447
+
7491
8448
  // feed-forward network
7492
8449
  {
7493
8450
  cur = llm_build_ffn(ctx0, cur,
@@ -7580,6 +8537,13 @@ struct llm_build_context {
7580
8537
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7581
8538
  }
7582
8539
 
8540
+ if (il == n_layer - 1) {
8541
+ // skip computing output for unused tokens
8542
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8543
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8544
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8545
+ }
8546
+
7583
8547
  // add the input
7584
8548
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
7585
8549
  cb(ffn_inp, "ffn_inp", il);
@@ -7680,6 +8644,13 @@ struct llm_build_context {
7680
8644
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7681
8645
  }
7682
8646
 
8647
+ if (il == n_layer - 1) {
8648
+ // skip computing output for unused tokens
8649
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8650
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8651
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8652
+ }
8653
+
7683
8654
  // add the input
7684
8655
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
7685
8656
  cb(ffn_inp, "ffn_inp", il);
@@ -7789,6 +8760,13 @@ struct llm_build_context {
7789
8760
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7790
8761
  }
7791
8762
 
8763
+ if (il == n_layer - 1) {
8764
+ // skip computing output for unused tokens
8765
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8766
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8767
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8768
+ }
8769
+
7792
8770
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7793
8771
  cb(ffn_inp, "ffn_inp", il);
7794
8772
 
@@ -7899,6 +8877,13 @@ struct llm_build_context {
7899
8877
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7900
8878
  }
7901
8879
 
8880
+ if (il == n_layer - 1) {
8881
+ // skip computing output for unused tokens
8882
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8883
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8884
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8885
+ }
8886
+
7902
8887
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7903
8888
  cb(ffn_inp, "ffn_inp", il);
7904
8889
 
@@ -8022,6 +9007,13 @@ struct llm_build_context {
8022
9007
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8023
9008
  }
8024
9009
 
9010
+ if (il == n_layer - 1) {
9011
+ // skip computing output for unused tokens
9012
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9013
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9014
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
9015
+ }
9016
+
8025
9017
  // scale_res - scale the hidden states for residual connection
8026
9018
  const float scale_res = scale_depth/sqrtf(float(n_layer));
8027
9019
  cur = ggml_scale(ctx0, cur, scale_res);
@@ -8136,6 +9128,13 @@ struct llm_build_context {
8136
9128
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
8137
9129
  }
8138
9130
 
9131
+ if (il == n_layer - 1) {
9132
+ // skip computing output for unused tokens
9133
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9134
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9135
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9136
+ }
9137
+
8139
9138
  struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
8140
9139
  cb(sa_out, "sa_out", il);
8141
9140
 
@@ -8248,6 +9247,13 @@ struct llm_build_context {
8248
9247
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8249
9248
  }
8250
9249
 
9250
+ if (il == n_layer - 1) {
9251
+ // skip computing output for unused tokens
9252
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9253
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9254
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
9255
+ }
9256
+
8251
9257
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8252
9258
  cb(ffn_inp, "ffn_inp", il);
8253
9259
 
@@ -8395,6 +9401,15 @@ struct llm_build_context {
8395
9401
 
8396
9402
  struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0);
8397
9403
 
9404
+ if (il == n_layer - 1) {
9405
+ // skip computing output for unused tokens
9406
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9407
+ x = ggml_get_rows(ctx0, x, inp_out_ids);
9408
+ y = ggml_get_rows(ctx0, y, inp_out_ids);
9409
+ z = ggml_get_rows(ctx0, z, inp_out_ids);
9410
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9411
+ }
9412
+
8398
9413
  // {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens}
8399
9414
  y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
8400
9415
  y = ggml_mul(ctx0, y, ggml_silu(ctx0, z));
@@ -8478,6 +9493,31 @@ struct llm_build_context {
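Note: the hunk below adds optional per-head normalization of Q and K (when attn_q_norm/attn_k_norm tensors are present) before RoPE is applied, by viewing the projections as (n_embd_head, n_head, n_tokens) and running llm_build_norm over each head. A simplified standalone sketch of the per-head normalization for one token, assuming the usual layer-norm formula with no bias and an illustrative eps value:

    #include <cmath>
    #include <vector>

    // q holds n_head * n_embd_head floats for one token; gamma has n_embd_head entries.
    static void norm_heads(std::vector<float> & q, int n_embd_head, int n_head,
                           const std::vector<float> & gamma, float eps = 1e-5f) {
        for (int h = 0; h < n_head; ++h) {
            float * v = q.data() + h*n_embd_head;
            float mean = 0.0f;
            for (int i = 0; i < n_embd_head; ++i) mean += v[i];
            mean /= n_embd_head;
            float var = 0.0f;
            for (int i = 0; i < n_embd_head; ++i) var += (v[i] - mean)*(v[i] - mean);
            var /= n_embd_head;
            const float inv = 1.0f/std::sqrt(var + eps);
            for (int i = 0; i < n_embd_head; ++i) v[i] = (v[i] - mean)*inv*gamma[i];
        }
    }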
8478
9493
  cb(Vcur, "Vcur", il);
8479
9494
  }
8480
9495
 
9496
+ if (model.layers[il].attn_q_norm) {
9497
+ Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
9498
+ ggml_element_size(Qcur) * n_embd_head,
9499
+ ggml_element_size(Qcur) * n_embd_head * n_head,
9500
+ 0);
9501
+ cb(Qcur, "Qcur", il);
9502
+ Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
9503
+ ggml_element_size(Kcur) * n_embd_head,
9504
+ ggml_element_size(Kcur) * n_embd_head * n_head_kv,
9505
+ 0);
9506
+ cb(Kcur, "Kcur", il);
9507
+
9508
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
9509
+ model.layers[il].attn_q_norm,
9510
+ NULL,
9511
+ LLM_NORM, cb, il);
9512
+ cb(Qcur, "Qcur", il);
9513
+
9514
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
9515
+ model.layers[il].attn_k_norm,
9516
+ NULL,
9517
+ LLM_NORM, cb, il);
9518
+ cb(Kcur, "Kcur", il);
9519
+ }
9520
+
8481
9521
  Qcur = ggml_rope_custom(
8482
9522
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8483
9523
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -8497,6 +9537,14 @@ struct llm_build_context {
8497
9537
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8498
9538
  }
8499
9539
 
9540
+ if (il == n_layer - 1) {
9541
+ // skip computing output for unused tokens
9542
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9543
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9544
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9545
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
9546
+ }
9547
+
8500
9548
  struct ggml_tensor * attn_out = cur;
8501
9549
 
8502
9550
  // feed-forward network
@@ -8648,6 +9696,10 @@ static struct ggml_cgraph * llama_build_graph(
8648
9696
  {
8649
9697
  result = llm.build_falcon();
8650
9698
  } break;
9699
+ case LLM_ARCH_GROK:
9700
+ {
9701
+ result = llm.build_grok();
9702
+ } break;
8651
9703
  case LLM_ARCH_STARCODER:
8652
9704
  {
8653
9705
  result = llm.build_starcoder();
@@ -8725,6 +9777,10 @@ static struct ggml_cgraph * llama_build_graph(
8725
9777
  {
8726
9778
  result = llm.build_mamba();
8727
9779
  } break;
9780
+ case LLM_ARCH_XVERSE:
9781
+ {
9782
+ result = llm.build_xverse();
9783
+ } break;
8728
9784
  case LLM_ARCH_COMMAND_R:
8729
9785
  {
8730
9786
  result = llm.build_command_r();
@@ -8790,9 +9846,39 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
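Note: the hunk below fills the host-side buffer behind inp_out_ids with the token positions whose outputs must be kept, depending on whether every token, only flagged tokens, or only the last token needs an output. A plain-C++ sketch of the same branch structure (illustrative, not the llama.cpp API):

    #include <cstdint>
    #include <vector>

    // logits may be null; n_outputs is the pre-computed number of outputs.
    static std::vector<int32_t> output_positions(int32_t n_tokens, const int8_t * logits, int32_t n_outputs) {
        std::vector<int32_t> ids;
        if (n_outputs == n_tokens) {
            for (int32_t i = 0; i < n_tokens; ++i) ids.push_back(i);
        } else if (logits) {
            for (int32_t i = 0; i < n_tokens; ++i) {
                if (logits[i]) ids.push_back(i);
            }
        } else if (n_outputs == 1) {
            ids.push_back(n_tokens - 1);   // keep only the last token
        }
        return ids;                        // empty when n_outputs == 0
    }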
8790
9846
  ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
8791
9847
  }
8792
9848
 
9849
+ if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
9850
+ GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
9851
+ const int64_t n_tokens = batch.n_tokens;
9852
+
9853
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
9854
+ int32_t * data = (int32_t *) lctx.inp_out_ids->data;
9855
+
9856
+ if (lctx.n_outputs == n_tokens) {
9857
+ for (int i = 0; i < n_tokens; ++i) {
9858
+ data[i] = i;
9859
+ }
9860
+ } else if (batch.logits) {
9861
+ int32_t n_outputs = 0;
9862
+ for (int i = 0; i < n_tokens; ++i) {
9863
+ if (batch.logits[i]) {
9864
+ data[n_outputs++] = i;
9865
+ }
9866
+ }
9867
+ // the graph needs to have been passed the correct number of outputs
9868
+ GGML_ASSERT(lctx.n_outputs == n_outputs);
9869
+ } else if (lctx.n_outputs == 1) {
9870
+ // only keep last output
9871
+ data[0] = n_tokens - 1;
9872
+ } else {
9873
+ GGML_ASSERT(lctx.n_outputs == 0);
9874
+ }
9875
+ }
9876
+
8793
9877
  GGML_ASSERT(
9878
+ // (!a || b) is a logical implication (a -> b)
9879
+ // !hparams.causal_attn -> !cparams.causal_attn
8794
9880
  (hparams.causal_attn || !cparams.causal_attn) &&
8795
- "non-causal attention with generative models is not supported"
9881
+ "causal attention with embedding models is not supported"
8796
9882
  );
8797
9883
 
8798
9884
  if (lctx.inp_KQ_mask) {
@@ -8971,7 +10057,75 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
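Note: the hunk below introduces llama_output_reserve, which replaces the fixed per-batch logits buffer with one packed host buffer holding only the requested outputs (logits rows followed by embedding rows), grown lazily when a batch needs more space. A sketch of the sizing rule it applies (names are illustrative):

    #include <algorithm>
    #include <cstddef>

    static size_t output_buffer_bytes(size_t n_outputs, size_t n_seq_max,
                                      size_t n_vocab, size_t n_embd,
                                      bool has_logits, bool has_embd) {
        const size_t n_outputs_max = std::max(n_outputs, n_seq_max);
        const size_t logits_size   = has_logits ? n_vocab*n_outputs_max : 0;
        const size_t embd_size     = has_embd   ? n_embd *n_outputs_max : 0;
        return (logits_size + embd_size)*sizeof(float);
    }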
8971
10057
  }
8972
10058
  }
8973
10059
 
8974
- static void llama_graph_compute(
10060
+ // Make sure enough space is available for outputs.
10061
+ // Returns max number of outputs for which space was reserved.
10062
+ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
10063
+ const auto & cparams = lctx.cparams;
10064
+ const auto & hparams = lctx.model.hparams;
10065
+
10066
+ const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
10067
+
10068
+ const auto n_batch = cparams.n_batch;
10069
+ const auto n_vocab = hparams.n_vocab;
10070
+ const auto n_embd = hparams.n_embd;
10071
+
10072
+ // TODO: use a per-batch flag for logits presence instead
10073
+ const bool has_logits = cparams.causal_attn;
10074
+ const bool has_embd = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
10075
+
10076
+ const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
10077
+ const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
10078
+
10079
+ if (lctx.output_ids.empty()) {
10080
+ // init, never resized afterwards
10081
+ lctx.output_ids.resize(n_batch);
10082
+ }
10083
+
10084
+ const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0;
10085
+ const size_t new_size = (logits_size + embd_size) * sizeof(float);
10086
+
10087
+ // alloc only when more than the current capacity is required
10088
+ // TODO: also consider shrinking the buffer
10089
+ if (!lctx.buf_output || prev_size < new_size) {
10090
+ if (lctx.buf_output) {
10091
+ #ifndef NDEBUG
10092
+ // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
10093
+ LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
10094
+ #endif
10095
+ ggml_backend_buffer_free(lctx.buf_output);
10096
+ lctx.buf_output = nullptr;
10097
+ lctx.logits = nullptr;
10098
+ lctx.embd = nullptr;
10099
+ }
10100
+
10101
+ lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
10102
+ if (lctx.buf_output == nullptr) {
10103
+ LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
10104
+ return 0;
10105
+ }
10106
+ }
10107
+
10108
+ float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output);
10109
+
10110
+ lctx.logits = has_logits ? output_base : nullptr;
10111
+ lctx.embd = has_embd ? output_base + logits_size : nullptr;
10112
+
10113
+ lctx.output_size = n_outputs_max;
10114
+ lctx.logits_size = logits_size;
10115
+ lctx.embd_size = embd_size;
10116
+
10117
+ // set all ids as invalid (negative)
10118
+ std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);
10119
+
10120
+ ggml_backend_buffer_clear(lctx.buf_output, 0);
10121
+
10122
+ lctx.n_outputs = 0;
10123
+
10124
+ return n_outputs_max;
10125
+ }
10126
+
10127
+
10128
+ static void llama_graph_compute(
8975
10129
  llama_context & lctx,
8976
10130
  ggml_cgraph * gf,
8977
10131
  int n_threads) {
@@ -9046,16 +10200,8 @@ static int llama_decode_internal(
9046
10200
  const int64_t n_embd = hparams.n_embd;
9047
10201
  const int64_t n_vocab = hparams.n_vocab;
9048
10202
 
9049
-
9050
- auto * logits_out = lctx.logits;
9051
-
9052
- #ifndef NDEBUG
9053
- auto & logits_valid = lctx.logits_valid;
9054
- logits_valid.clear();
9055
- logits_valid.resize(n_tokens_all);
9056
-
9057
- memset(logits_out, 0, lctx.logits_size*sizeof(float));
9058
- #endif
10203
+ uint32_t n_outputs = 0;
10204
+ uint32_t n_outputs_prev = 0;
9059
10205
 
9060
10206
  const auto n_ubatch = cparams.n_ubatch;
9061
10207
 
@@ -9064,6 +10210,38 @@ static int llama_decode_internal(
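Note: the hunk below counts the outputs of the whole batch up front, reserves the packed buffer, and fills lctx.output_ids so each batch position maps to its row in that buffer. A minimal illustration of the mapping (simplified; the "last token only" case is handled by a separate branch upstream):

    #include <cstdint>
    #include <vector>

    // Returns, for every batch position, its row in the packed output buffer,
    // or -1 when the position produces no output.
    static std::vector<int32_t> build_output_map(uint32_t n_tokens, const int8_t * logits) {
        std::vector<int32_t> map(n_tokens, -1);
        int32_t row = 0;
        for (uint32_t i = 0; i < n_tokens; ++i) {
            if (!logits || logits[i]) {
                map[i] = row++;
            }
        }
        return map;
    }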
9064
10210
  std::vector<llama_seq_id *> seq_id_arr;
9065
10211
  std::vector<std::vector<llama_seq_id>> seq_id;
9066
10212
 
10213
+ // count outputs
10214
+ if (batch_all.logits) {
10215
+ for (uint32_t i = 0; i < n_tokens_all; ++i) {
10216
+ n_outputs += batch_all.logits[i] != 0;
10217
+ }
10218
+ } else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
10219
+ n_outputs = n_tokens_all;
10220
+ } else {
10221
+ // keep last output only
10222
+ n_outputs = 1;
10223
+ }
10224
+
10225
+ // reserve output buffer
10226
+ if (llama_output_reserve(lctx, n_outputs) < n_outputs) {
10227
+ LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs);
10228
+ return -2;
10229
+ };
10230
+
10231
+ // set output mappings
10232
+ if (batch_all.logits) {
10233
+ int32_t i_logits = 0;
10234
+ for (uint32_t i = 0; i < n_tokens_all; ++i) {
10235
+ if (batch_all.logits[i]) {
10236
+ lctx.output_ids[i] = i_logits++;
10237
+ }
10238
+ }
10239
+ } else {
10240
+ for (uint32_t i = 0; i < n_outputs; ++i) {
10241
+ lctx.output_ids[i] = i;
10242
+ }
10243
+ }
10244
+
9067
10245
  for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
9068
10246
  const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
9069
10247
  llama_batch u_batch = {
@@ -9079,6 +10257,27 @@ static int llama_decode_internal(
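Note: the hunk below computes how many outputs fall inside each ubatch window and stores it in lctx.n_outputs before the graph is built, since the graph sizes inp_out_ids from that value. A hedged standalone sketch of the count (parameter names are illustrative):

    #include <cstdint>

    static int32_t count_ubatch_outputs(const int8_t * logits, uint32_t cur_token, uint32_t n_tokens,
                                        uint32_t n_tokens_all, uint32_t n_outputs_total) {
        int32_t n = 0;
        if (logits) {
            for (uint32_t i = 0; i < n_tokens; ++i) {
                n += logits[cur_token + i] != 0;
            }
        } else if (n_outputs_total == n_tokens_all) {
            n = (int32_t) n_tokens;
        } else if (cur_token + n_tokens >= n_tokens_all) {
            n = 1; // only the last ubatch carries the single output
        }
        return n;
    }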
9079
10257
  /* .all_seq_id = */ batch_all.all_seq_id,
9080
10258
  };
9081
10259
 
10260
+ // count the outputs in this u_batch
10261
+ {
10262
+ int32_t n_outputs_new = 0;
10263
+
10264
+ if (u_batch.logits) {
10265
+ for (uint32_t i = 0; i < n_tokens; i++) {
10266
+ n_outputs_new += u_batch.logits[i] != 0;
10267
+ }
10268
+ } else if (n_outputs == n_tokens_all) {
10269
+ n_outputs_new = n_tokens;
10270
+ } else {
10271
+ // keep last output only
10272
+ if (cur_token + n_tokens >= n_tokens_all) {
10273
+ n_outputs_new = 1;
10274
+ }
10275
+ }
10276
+
10277
+ // needs to happen before the graph is built
10278
+ lctx.n_outputs = n_outputs_new;
10279
+ }
10280
+
9082
10281
  int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
9083
10282
  GGML_ASSERT(n_threads > 0);
9084
10283
 
@@ -9142,23 +10341,37 @@ static int llama_decode_internal(
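Note: the hunk below reworks how the result tensors are picked from the end of the graph: when no outputs are needed both are dropped, and for embedding-only (non-causal) runs the graph is truncated right after "result_norm" so the logits matmul is never scheduled. An equivalent backward scan over node names, as a standalone sketch (struct and names are illustrative):

    #include <cstring>
    #include <vector>

    struct node { const char * name; };

    // Returns the index of the "result_norm" node near the tail of the graph, or -1.
    // Truncating the node list right after it is what lets the logits matmul be skipped.
    static int find_embd_node(const std::vector<node> & nodes) {
        for (int i = (int) nodes.size() - 2; i >= 0; --i) {
            if (std::strcmp(nodes[i].name, "result_norm") == 0) {
                return i;
            }
        }
        return -1;
    }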
9142
10341
  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
9143
10342
  struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
9144
10343
 
9145
- if (!hparams.causal_attn) {
10344
+ if (lctx.n_outputs == 0) {
10345
+ // no output
10346
+ res = nullptr;
10347
+ embd = nullptr;
10348
+ } else if (!hparams.causal_attn) {
9146
10349
  res = nullptr; // do not extract logits for embedding models such as BERT
9147
10350
 
9148
10351
  // token or sequence embeddings
9149
10352
  embd = gf->nodes[gf->n_nodes - 1];
9150
10353
 
9151
10354
  GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
9152
- } else {
9153
- if (strcmp(res->name, "result_output") == 0) {
9154
- // the token embeddings could be the second to last tensor, or the third to last tensor
9155
- if (strcmp(embd->name, "result_norm") != 0) {
9156
- embd = gf->nodes[gf->n_nodes - 3];
9157
- GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
9158
- }
9159
- } else {
9160
- GGML_ASSERT(false && "missing result_output tensor");
10355
+ } else if (cparams.embeddings) {
10356
+ // the embeddings could be in the second to last tensor, or any of the previous tensors
10357
+ int i_embd = gf->n_nodes - 2;
10358
+ for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
10359
+ i_embd = gf->n_nodes - i;
10360
+ if (i_embd < 0) { break; }
10361
+ embd = gf->nodes[i_embd];
9161
10362
  }
10363
+ GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
10364
+
10365
+ // TODO: use a per-batch flag to know when to skip logits while keeping embeddings
10366
+ if (!cparams.causal_attn) {
10367
+ res = nullptr; // do not extract logits when not needed
10368
+ // skip computing logits
10369
+ // TODO: is this safe?
10370
+ gf->n_nodes = i_embd + 1;
10371
+ }
10372
+ } else {
10373
+ embd = nullptr; // do not extract embeddings when not needed
10374
+ GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
9162
10375
  }
9163
10376
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
9164
10377
 
@@ -9201,50 +10414,23 @@ static int llama_decode_internal(
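Note: the hunk below drops the per-token range bookkeeping and copies each ubatch's logits (and, further down, embeddings) in a single contiguous async read into the packed buffer, offset by the outputs already written. The row geometry, as a small sketch (n_cols is n_vocab for logits or n_embd for embeddings):

    #include <cstddef>

    struct out_span { size_t offset_floats; size_t count_floats; };

    // Ubatches are appended one after another into the packed output buffer.
    static out_span packed_span(size_t n_outputs_prev, size_t n_outputs_new, size_t n_cols) {
        return { n_outputs_prev*n_cols, n_outputs_new*n_cols };
    }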
9201
10414
  //}
9202
10415
 
9203
10416
  // extract logits
9204
- // TODO: do not compute and extract logits if only embeddings are needed
9205
- // update the graphs to skip "result_output" if logits are not needed
9206
10417
  if (res) {
9207
10418
  ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
9208
10419
  GGML_ASSERT(backend_res != nullptr);
9209
- if (u_batch.logits) {
9210
- int32_t i_first = -1;
9211
- for (uint32_t i = 0; i < n_tokens; i++) {
9212
- if (u_batch.logits[i] && i_first == -1) {
9213
- i_first = (int32_t) i;
9214
- }
9215
- if (u_batch.logits[i] == 0 || i == n_tokens - 1) {
9216
- if (i_first != -1) {
9217
- int i_last = u_batch.logits[i] == 0 ? i : i + 1;
9218
- // extract logits for the range [i_first, i_last)
9219
- // group the requests to minimize the number of calls to the backend
9220
- ggml_backend_tensor_get_async(backend_res, res,
9221
- logits_out + n_vocab*(cur_token + i_first),
9222
- i_first*n_vocab*sizeof(float),
9223
- (i_last - i_first)*n_vocab*sizeof(float));
9224
- i_first = -1;
9225
- }
9226
- }
9227
- #ifndef NDEBUG
9228
- logits_valid[cur_token + i] = u_batch.logits[i] != 0;;
9229
- #endif
9230
- }
9231
- } else if (lctx.logits_all) {
9232
- ggml_backend_tensor_get_async(backend_res, res, logits_out + n_vocab*cur_token, 0, n_vocab*n_tokens*sizeof(float));
9233
- #ifndef NDEBUG
9234
- std::fill(logits_valid.begin() + cur_token, logits_valid.begin() + cur_token + n_tokens, true);
9235
- #endif
9236
- } else {
9237
- if (cur_token + n_tokens >= n_tokens_all) {
9238
- ggml_backend_tensor_get_async(backend_res, res, logits_out, n_vocab*(n_tokens - 1)*sizeof(float), n_vocab*sizeof(float));
9239
- #ifndef NDEBUG
9240
- logits_valid[0] = true;
9241
- #endif
9242
- }
10420
+ GGML_ASSERT(lctx.logits != nullptr);
10421
+
10422
+ float * logits_out = lctx.logits + n_outputs_prev*n_vocab;
10423
+ const int32_t n_outputs_new = lctx.n_outputs;
10424
+
10425
+ if (n_outputs_new) {
10426
+ GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
10427
+ GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size);
10428
+ ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float));
9243
10429
  }
9244
10430
  }
9245
10431
 
9246
10432
  // extract embeddings
9247
- if (cparams.embeddings && embd) {
10433
+ if (embd) {
9248
10434
  ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
9249
10435
  GGML_ASSERT(backend_embd != nullptr);
9250
10436
 
@@ -9252,16 +10438,14 @@ static int llama_decode_internal(
9252
10438
  case LLAMA_POOLING_TYPE_NONE:
9253
10439
  {
9254
10440
  // extract token embeddings
9255
- auto & embd_out = lctx.embd;
9256
-
9257
- if (u_batch.logits) {
9258
- //embd_out.resize(n_embd * n_tokens);
9259
- for (uint32_t i = 0; i < n_tokens; i++) {
9260
- if (u_batch.logits[i] == 0) {
9261
- continue;
9262
- }
9263
- ggml_backend_tensor_get_async(backend_embd, embd, embd_out + n_embd*(i + cur_token), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
9264
- }
10441
+ GGML_ASSERT(lctx.embd != nullptr);
10442
+ float * embd_out = lctx.embd + n_outputs_prev*n_embd;
10443
+ const int32_t n_outputs_new = lctx.n_outputs;
10444
+
10445
+ if (n_outputs_new) {
10446
+ GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
10447
+ GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size);
10448
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
9265
10449
  }
9266
10450
  } break;
9267
10451
  case LLAMA_POOLING_TYPE_CLS:
@@ -9288,8 +10472,12 @@ static int llama_decode_internal(
9288
10472
  } break;
9289
10473
  }
9290
10474
  }
10475
+ n_outputs_prev += lctx.n_outputs;
9291
10476
  }
9292
10477
 
10478
+ // set to total number of outputs in the batch, for use in llama_get_logits_ith
10479
+ lctx.n_outputs = n_outputs;
10480
+
9293
10481
  // wait for the computation to finish (automatically done when obtaining the model output)
9294
10482
  //llama_synchronize(&lctx);
9295
10483
 
@@ -9933,7 +11121,7 @@ struct llm_tokenizer_bpe {
9933
11121
  add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
9934
11122
  }
9935
11123
 
9936
- // add the fnished tokens to the final list keeping correct order for next and prev
11124
+ // add the finished tokens to the final list keeping correct order for next and prev
9937
11125
  for (auto & sym : symbols) {
9938
11126
  if (sym.n > 0) {
9939
11127
  sym.prev = final_prev_index;
@@ -10202,9 +11390,6 @@ struct llm_tokenizer_wpm {
10202
11390
  output.push_back(vocab.special_unk_id);
10203
11391
  }
10204
11392
  }
10205
-
10206
- // append eos token
10207
- output.push_back(vocab.special_eos_id);
10208
11393
  }
10209
11394
 
10210
11395
  std::vector<std::string> preprocess(const std::string & text) {
@@ -10218,7 +11403,7 @@ struct llm_tokenizer_wpm {
10218
11403
  if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
10219
11404
  continue;
10220
11405
  }
10221
- code = to_lower(code);
11406
+ code = unicode_tolower(code);
10222
11407
  if (type == CODEPOINT_TYPE_WHITESPACE) {
10223
11408
  code = ' ';
10224
11409
  }
@@ -10238,7 +11423,7 @@ struct llm_tokenizer_wpm {
10238
11423
  std::vector<std::string> words;
10239
11424
  while (r < new_str.size()) {
10240
11425
  // if is whitespace
10241
- if (isspace(new_str[r])) {
11426
+ if (isspace(new_str[r], std::locale::classic())) {
10242
11427
  if (r > l) words.push_back(new_str.substr(l, (r - l)));
10243
11428
  l = r + 1;
10244
11429
  r = l;
@@ -10252,18 +11437,12 @@ struct llm_tokenizer_wpm {
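Note: the hunk below removes the locale-dependent to_lower helper (lowercasing now goes through unicode_tolower) and makes is_ascii_punct independent of the process-global locale by classifying through std::locale::classic(). A standalone sketch of a locale-independent ASCII punctuation check in the same spirit:

    #include <cstdint>

    // Printable, non-space, non-alphanumeric ASCII; everything above 0x7E is
    // rejected outright, so the result no longer depends on the global locale.
    static bool is_ascii_punct_sketch(uint32_t code) {
        if (code < 0x21 || code > 0x7E) {
            return false;
        }
        const bool alnum = (code >= '0' && code <= '9') ||
                           (code >= 'A' && code <= 'Z') ||
                           (code >= 'a' && code <= 'z');
        return !alnum;
    }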
10252
11437
  return words;
10253
11438
  }
10254
11439
 
10255
- uint32_t to_lower(uint32_t code) {
10256
- static const std::locale locale("en_US.UTF-8");
10257
- #if defined(_WIN32)
10258
- if (code > 0xFFFF) {
10259
- return code;
10260
- }
10261
- #endif
10262
- return std::tolower(wchar_t(code), locale);
10263
- }
10264
-
10265
11440
  bool is_ascii_punct(uint32_t code) {
10266
- return code < 256 && ispunct(code);
11441
+ if (code > 0xFF) {
11442
+ return false;
11443
+ }
11444
+ auto c = char(static_cast<unsigned char>(code));
11445
+ return ispunct(c, std::locale::classic());
10267
11446
  }
10268
11447
 
10269
11448
  bool is_chinese_char(uint32_t cpt) {
@@ -10415,30 +11594,28 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
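Note: this hunk and the three that follow rename the tokenizer flags to add_special/parse_special and move special-token insertion into the per-vocab branches: SPM prepends BOS and may append EOS, BPE only prepends BOS (and asserts EOS is never auto-appended), and WPM wraps the sequence in CLS ... SEP. A condensed standalone sketch of that decision (enum and parameter names are illustrative; the real code also distinguishes the default -1 flag values):

    #include <vector>

    enum class vocab_type { spm, bpe, wpm };

    static void wrap_with_specials(std::vector<int> & toks, vocab_type t,
                                   int bos, int eos, int cls, int sep,
                                   bool add_bos_flag, bool add_eos_flag) {
        switch (t) {
            case vocab_type::spm:
                if (add_bos_flag) toks.insert(toks.begin(), bos);
                if (add_eos_flag) toks.push_back(eos);
                break;
            case vocab_type::bpe:
                if (add_bos_flag) toks.insert(toks.begin(), bos);
                // EOS is never auto-appended for BPE vocabs
                break;
            case vocab_type::wpm:
                toks.insert(toks.begin(), cls);   // [CLS] ... [SEP]
                toks.push_back(sep);
                break;
        }
    }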
10415
11594
  }
10416
11595
  }
10417
11596
 
10418
- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
11597
+ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
10419
11598
  std::vector<llama_vocab::id> output;
10420
-
10421
- // OG tokenizer behavior:
10422
- //
10423
- // tokenizer.encode('', add_bos=True) returns [1]
10424
- // tokenizer.encode('', add_bos=False) returns []
10425
-
10426
- if (bos && vocab.special_bos_id != -1) {
10427
- output.push_back(vocab.special_bos_id);
10428
- }
10429
-
10430
- if (raw_text.empty()) {
10431
- return output;
10432
- }
10433
-
10434
11599
  std::forward_list<fragment_buffer_variant> fragment_buffer;
10435
- fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
10436
11600
 
10437
- if (special) tokenizer_st_partition(vocab, fragment_buffer);
11601
+ if (!raw_text.empty()) {
11602
+ fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
11603
+ if (parse_special) tokenizer_st_partition(vocab, fragment_buffer);
11604
+ }
10438
11605
 
10439
11606
  switch (vocab.type) {
10440
11607
  case LLAMA_VOCAB_TYPE_SPM:
10441
11608
  {
11609
+ // OG tokenizer behavior:
11610
+ //
11611
+ // tokenizer.encode('', add_special_tokens=True) returns [1]
11612
+ // tokenizer.encode('', add_special_tokens=False) returns []
11613
+
11614
+ if (add_special && vocab.special_add_bos != 0) {
11615
+ GGML_ASSERT(vocab.special_bos_id != -1);
11616
+ output.push_back(vocab.special_bos_id);
11617
+ }
11618
+
10442
11619
  for (const auto & fragment : fragment_buffer) {
10443
11620
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
10444
11621
  // without adding this leading whitespace, we do not get the same results as the original tokenizer
@@ -10464,9 +11641,19 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
10464
11641
  output.push_back(fragment.token);
10465
11642
  }
10466
11643
  }
11644
+
11645
+ if (add_special && vocab.special_add_eos == 1) {
11646
+ GGML_ASSERT(vocab.special_eos_id != -1);
11647
+ output.push_back(vocab.special_eos_id);
11648
+ }
10467
11649
  } break;
10468
11650
  case LLAMA_VOCAB_TYPE_BPE:
10469
11651
  {
11652
+ if (add_special && vocab.special_add_bos == 1) {
11653
+ GGML_ASSERT(vocab.special_bos_id != -1);
11654
+ output.push_back(vocab.special_bos_id);
11655
+ }
11656
+
10470
11657
  for (const auto & fragment : fragment_buffer) {
10471
11658
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
10472
11659
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -10480,9 +11667,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
10480
11667
  output.push_back(fragment.token);
10481
11668
  }
10482
11669
  }
11670
+
11671
+ GGML_ASSERT(vocab.special_add_eos != 1);
10483
11672
  } break;
10484
11673
  case LLAMA_VOCAB_TYPE_WPM:
10485
11674
  {
11675
+ if (add_special) {
11676
+ GGML_ASSERT(vocab.special_cls_id != -1);
11677
+ output.push_back(vocab.special_cls_id);
11678
+ }
11679
+
10486
11680
  for (const auto & fragment : fragment_buffer) {
10487
11681
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
10488
11682
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -10496,6 +11690,11 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
10496
11690
  output.push_back(fragment.token);
10497
11691
  }
10498
11692
  }
11693
+
11694
+ if (add_special) {
11695
+ GGML_ASSERT(vocab.special_sep_id != -1);
11696
+ output.push_back(vocab.special_sep_id);
11697
+ }
10499
11698
  } break;
10500
11699
  case LLAMA_VOCAB_TYPE_NONE:
10501
11700
  GGML_ASSERT(false);
@@ -10508,28 +11707,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
10508
11707
  // grammar - internal
10509
11708
  //
10510
11709
 
10511
- struct llama_partial_utf8 {
10512
- uint32_t value; // bit value so far (unshifted)
10513
- int n_remain; // num bytes remaining; -1 indicates invalid sequence
10514
- };
10515
-
10516
- struct llama_grammar {
10517
- const std::vector<std::vector<llama_grammar_element>> rules;
10518
- std::vector<std::vector<const llama_grammar_element *>> stacks;
10519
-
10520
- // buffer for partially generated UTF-8 sequence from accepted tokens
10521
- llama_partial_utf8 partial_utf8;
10522
- };
10523
-
10524
- struct llama_grammar_candidate {
10525
- size_t index;
10526
- const uint32_t * code_points;
10527
- llama_partial_utf8 partial_utf8;
10528
- };
10529
11710
 
10530
11711
  // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
10531
11712
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
10532
- static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
11713
+ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
10533
11714
  const std::string & src,
10534
11715
  llama_partial_utf8 partial_start) {
10535
11716
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@@ -10680,7 +11861,9 @@ static void llama_grammar_advance_stack(
10680
11861
  std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
10681
11862
 
10682
11863
  if (stack.empty()) {
10683
- new_stacks.emplace_back(stack);
11864
+ if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
11865
+ new_stacks.emplace_back(stack);
11866
+ }
10684
11867
  return;
10685
11868
  }
10686
11869
 
@@ -10717,7 +11900,10 @@ static void llama_grammar_advance_stack(
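Note: this hunk and the next change the grammar engine to skip duplicate stacks before inserting them and to let llama_grammar_accept reuse a caller-provided output vector instead of returning a fresh one, cutting redundant work and allocations during sampling. The dedup idea in isolation, as a generic sketch:

    #include <algorithm>
    #include <vector>

    // Only push a candidate if an equal element is not already in the output vector.
    template <typename T>
    static void push_unique(std::vector<T> & out, const T & v) {
        if (std::find(out.begin(), out.end(), v) == out.end()) {
            out.push_back(v);
        }
    }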
10717
11900
  }
10718
11901
  case LLAMA_GRETYPE_CHAR:
10719
11902
  case LLAMA_GRETYPE_CHAR_NOT:
10720
- new_stacks.emplace_back(stack);
11903
+ if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
11904
+ // only add the stack if it's not a duplicate of one we already have
11905
+ new_stacks.emplace_back(stack);
11906
+ }
10721
11907
  break;
10722
11908
  default:
10723
11909
  // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -10731,12 +11917,13 @@ static void llama_grammar_advance_stack(
10731
11917
  // be positioned at a character range (see `llama_grammar_advance_stack`), and
10732
11918
  // produces the N possible stacks if the given char is accepted at those
10733
11919
  // positions
10734
- static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
11920
+ void llama_grammar_accept(
10735
11921
  const std::vector<std::vector<llama_grammar_element>> & rules,
10736
11922
  const std::vector<std::vector<const llama_grammar_element *>> & stacks,
10737
- const uint32_t chr) {
11923
+ const uint32_t chr,
11924
+ std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
10738
11925
 
10739
- std::vector<std::vector<const llama_grammar_element *>> new_stacks;
11926
+ new_stacks.clear();
10740
11927
 
10741
11928
  for (const auto & stack : stacks) {
10742
11929
  if (stack.empty()) {
@@ -10755,8 +11942,6 @@ static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_acc
10755
11942
  llama_grammar_advance_stack(rules, new_stack, new_stacks);
10756
11943
  }
10757
11944
  }
10758
-
10759
- return new_stacks;
10760
11945
  }
10761
11946
 
10762
11947
  static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
@@ -10770,6 +11955,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
10770
11955
  const std::vector<llama_grammar_candidate> & candidates) {
10771
11956
 
10772
11957
  std::vector<llama_grammar_candidate> rejects;
11958
+ rejects.reserve(candidates.size());
10773
11959
 
10774
11960
  if (stack.empty()) {
10775
11961
  for (const auto & tok : candidates) {
@@ -10783,6 +11969,8 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
10783
11969
  const llama_grammar_element * stack_pos = stack.back();
10784
11970
 
10785
11971
  std::vector<llama_grammar_candidate> next_candidates;
11972
+ next_candidates.reserve(candidates.size());
11973
+
10786
11974
  for (const auto & tok : candidates) {
10787
11975
  if (*tok.code_points == 0) {
10788
11976
  // reached end of full codepoints in token, reject iff it ended in a partial sequence
@@ -11590,8 +12778,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
11590
12778
  // Note terminating 0 in decoded string
11591
12779
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
11592
12780
  const auto & code_points = decoded.first;
12781
+ std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
11593
12782
  for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
11594
- grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
12783
+ llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
12784
+ grammar->stacks = tmp_new_stacks;
11595
12785
  }
11596
12786
  grammar->partial_utf8 = decoded.second;
11597
12787
  GGML_ASSERT(!grammar->stacks.empty());
@@ -11957,7 +13147,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
11957
13147
  // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
11958
13148
  // for getting the current layer as I initially thought, and we need to resort to parsing the
11959
13149
  // tensor name.
11960
- n_layer /= n_expert;
11961
13150
  if (sscanf(name, "blk.%d.", &i_layer) != 1) {
11962
13151
  throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
11963
13152
  }
@@ -11971,30 +13160,39 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
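Note: the hunk below lets the quantizer honor explicit per-tensor type overrides: the quantize params gain output_tensor_type and token_embedding_type, with GGML_TYPE_COUNT acting as "unset", and the heuristic type selection only runs when no override is given. A hedged usage sketch (check llama.h of this version for the exact signatures; the chosen types and paths are just examples):

    #include "llama.h"

    static void quantize_with_overrides(const char * fin, const char * fout) {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M; // any supported ftype
        params.output_tensor_type   = GGML_TYPE_Q6_K;            // forces output.weight
        params.token_embedding_type = GGML_TYPE_Q4_K;            // forces token_embd.weight
        llama_model_quantize(fin, fout, &params);
    }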
11971
13160
  // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
11972
13161
  // with the quantization of the output tensor
11973
13162
  if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
11974
- int nx = tensor->ne[0];
11975
- if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
11976
- new_type = GGML_TYPE_Q8_0;
11977
- }
11978
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
11979
- ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
11980
- new_type = GGML_TYPE_Q5_K;
11981
- }
11982
- else if (new_type != GGML_TYPE_Q8_0) {
11983
- new_type = GGML_TYPE_Q6_K;
13163
+ if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
13164
+ new_type = qs.params->output_tensor_type;
13165
+ } else {
13166
+ int nx = tensor->ne[0];
13167
+ if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
13168
+ new_type = GGML_TYPE_Q8_0;
13169
+ }
13170
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
13171
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
13172
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
13173
+ new_type = GGML_TYPE_Q5_K;
13174
+ }
13175
+ else if (new_type != GGML_TYPE_Q8_0) {
13176
+ new_type = GGML_TYPE_Q6_K;
13177
+ }
11984
13178
  }
11985
13179
  } else if (name == "token_embd.weight") {
11986
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
11987
- ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
11988
- new_type = GGML_TYPE_Q2_K;
11989
- }
11990
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
11991
- new_type = GGML_TYPE_IQ3_S;
11992
- }
11993
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
11994
- new_type = GGML_TYPE_IQ3_S;
13180
+ if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
13181
+ new_type = qs.params->token_embedding_type;
13182
+ } else {
13183
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
13184
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
13185
+ new_type = GGML_TYPE_Q2_K;
13186
+ }
13187
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
13188
+ new_type = GGML_TYPE_IQ3_S;
13189
+ }
13190
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
13191
+ new_type = GGML_TYPE_IQ3_S;
13192
+ }
11995
13193
  }
11996
13194
  } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
11997
- ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
13195
+ ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
11998
13196
  if (name.find("attn_v.weight") != std::string::npos) {
11999
13197
  if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
12000
13198
  else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
@@ -12013,7 +13211,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
12013
13211
  if (qs.model.hparams.n_expert == 8) {
12014
13212
  new_type = GGML_TYPE_Q5_K;
12015
13213
  } else {
12016
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
13214
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
12017
13215
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
12018
13216
  }
12019
13217
  }
@@ -12027,13 +13225,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
12027
13225
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
12028
13226
  new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
12029
13227
  }
12030
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
12031
- new_type = GGML_TYPE_Q4_K;
12032
- }
12033
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
12034
- new_type = GGML_TYPE_Q4_K;
12035
- }
12036
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
13228
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
12037
13229
  new_type = GGML_TYPE_Q4_K;
12038
13230
  }
12039
13231
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
@@ -12186,7 +13378,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
12186
13378
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
12187
13379
  new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
12188
13380
  new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
12189
- new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
13381
+ new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
13382
+ new_type == GGML_TYPE_IQ1_M) {
12190
13383
  int nx = tensor->ne[0];
12191
13384
  int ny = tensor->ne[1];
12192
13385
  if (nx % QK_K != 0) {
@@ -12204,6 +13397,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
12204
13397
  case GGML_TYPE_IQ3_XXS:
12205
13398
  case GGML_TYPE_IQ3_S:
12206
13399
  case GGML_TYPE_IQ1_S:
13400
+ case GGML_TYPE_IQ1_M:
12207
13401
  case GGML_TYPE_Q2_K:
12208
13402
  case GGML_TYPE_Q3_K:
12209
13403
  case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
@@ -12219,9 +13413,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
12219
13413
  return new_type;
12220
13414
  }
12221
13415
 
12222
- static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
13416
+ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
12223
13417
  std::mutex mutex;
12224
- int counter = 0;
13418
+ int64_t counter = 0;
12225
13419
  size_t new_size = 0;
12226
13420
  if (nthread < 2) {
12227
13421
  // single-thread
@@ -12229,11 +13423,11 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
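Note: the hunks around llama_tensor_quantize_internal widen the shared work counter and row counts to int64_t so very large tensors don't overflow. For context, a minimal self-contained version of the chunk-claiming pattern the workers use (illustration only, with 64-bit counters as in the change):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <mutex>
    #include <thread>
    #include <vector>

    int main() {
        const int64_t total = 1000, chunk = 64;
        int64_t counter = 0;
        int64_t done = 0;
        std::mutex m;

        auto worker = [&]() {
            while (true) {
                int64_t first;
                {
                    std::lock_guard<std::mutex> lock(m);
                    first = counter;
                    counter += chunk;
                }
                if (first >= total) break;
                const int64_t n = std::min(total - first, chunk);
                // ... quantize rows [first, first + n) here ...
                std::lock_guard<std::mutex> lock(m);
                done += n;
            }
        };

        std::vector<std::thread> workers;
        for (int i = 0; i < 4; ++i) workers.emplace_back(worker);
        for (auto & w : workers) w.join();
        std::printf("processed %lld rows\n", (long long) done);
        return 0;
    }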
12229
13423
  }
12230
13424
  auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
12231
13425
  nrows, n_per_row, imatrix]() {
12232
- const int nrows_per_chunk = chunk_size / n_per_row;
13426
+ const int64_t nrows_per_chunk = chunk_size / n_per_row;
12233
13427
  size_t local_size = 0;
12234
13428
  while (true) {
12235
13429
  std::unique_lock<std::mutex> lock(mutex);
12236
- int first_row = counter; counter += nrows_per_chunk;
13430
+ int64_t first_row = counter; counter += nrows_per_chunk;
12237
13431
  if (first_row >= nrows) {
12238
13432
  if (local_size > 0) {
12239
13433
  new_size += local_size;
@@ -12241,7 +13435,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
12241
13435
  break;
12242
13436
  }
12243
13437
  lock.unlock();
12244
- const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
13438
+ const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
12245
13439
  local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
12246
13440
  }
12247
13441
  };
@@ -12285,6 +13479,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12285
13479
  case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
12286
13480
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
12287
13481
  case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
13482
+ case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
12288
13483
  case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
12289
13484
  case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
12290
13485
  case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
@@ -12307,8 +13502,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
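Note: the hunk below (together with the KV-writing hunk that follows) threads params->kv_overrides through the quantizer: the pointer is interpreted as a std::vector of llama_model_kv_override, applied when loading the input model, and re-written into the output GGUF. A sketch of building such a list, using the field names seen in the diff (the key here is a made-up example, and the empty-key entry terminates the list because the loop stops at o.key[0] == 0):

    #include <cstring>
    #include <vector>
    #include "llama.h"

    static std::vector<llama_model_kv_override> make_kv_overrides() {
        std::vector<llama_model_kv_override> kvo;

        llama_model_kv_override o = {};
        std::strncpy(o.key, "example.custom_flag", sizeof(o.key) - 1); // hypothetical key
        o.tag        = LLAMA_KV_OVERRIDE_TYPE_BOOL;
        o.bool_value = true;
        kvo.push_back(o);

        kvo.push_back(llama_model_kv_override{});  // empty-key terminator
        return kvo;
    }

The quantize path expects a pointer to the vector object itself (as the cast in the hunk shows), so the caller would set params.kv_overrides to the address of this vector and keep it alive for the duration of the call.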
12307
13502
  constexpr bool use_mmap = false;
12308
13503
  #endif
12309
13504
 
12310
- llama_model_loader ml(fname_inp, use_mmap, NULL);
12311
- ml.init_mapping(false); // no prefetching?
13505
+ llama_model_kv_override * kv_overrides = nullptr;
13506
+ if (params->kv_overrides) {
13507
+ auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
13508
+ kv_overrides = v->data();
13509
+ }
13510
+ llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
13511
+ ml.init_mappings(false); // no prefetching
12312
13512
 
12313
13513
  llama_model model;
12314
13514
  llm_load_arch(ml, model);
@@ -12332,36 +13532,48 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12332
13532
  struct gguf_context * ctx_out = gguf_init_empty();
12333
13533
 
12334
13534
  // copy the KV pairs from the input file
12335
- gguf_set_kv (ctx_out, ml.ctx_gguf);
13535
+ gguf_set_kv (ctx_out, ml.meta);
12336
13536
  gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
12337
13537
  gguf_set_val_u32(ctx_out, "general.file_type", ftype);
12338
13538
 
13539
+ if (params->kv_overrides) {
13540
+ const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
13541
+ for (auto & o : overrides) {
13542
+ if (o.key[0] == 0) break;
13543
+ if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
13544
+ gguf_set_val_f32(ctx_out, o.key, o.float_value);
13545
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
13546
+ gguf_set_val_i32(ctx_out, o.key, o.int_value);
13547
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
13548
+ gguf_set_val_bool(ctx_out, o.key, o.bool_value);
13549
+ } else {
13550
+ LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
13551
+ }
13552
+ }
13553
+ }
13554
+
12339
13555
  for (int i = 0; i < ml.n_tensors; ++i) {
12340
- struct ggml_tensor * meta = ml.get_tensor_meta(i);
13556
+ const struct ggml_tensor * meta = ml.get_tensor_meta(i);
12341
13557
 
12342
13558
  const std::string name = ggml_get_name(meta);
12343
13559
 
12344
13560
  // TODO: avoid hardcoded tensor names - use the TN_* constants
12345
- if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
13561
+ if (name.find("attn_v.weight") != std::string::npos ||
13562
+ name.find("attn_qkv.weight") != std::string::npos) {
12346
13563
  ++qs.n_attention_wv;
12347
- }
12348
- else if (name.find("ffn_down") != std::string::npos) {
12349
- ++qs.n_ffn_down;
12350
- }
12351
- else if (name.find("ffn_gate") != std::string::npos) {
12352
- ++qs.n_ffn_gate;
12353
- }
12354
- else if (name.find("ffn_up") != std::string::npos) {
12355
- ++qs.n_ffn_up;
12356
- }
12357
- else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
13564
+ } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
12358
13565
  qs.has_output = true;
12359
13566
  }
12360
13567
  }
12361
- if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
12362
- LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
12363
- __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
12364
- }
13568
+
13569
+ qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
13570
+
13571
+ // sanity checks
13572
+ //
13573
+ // - qs.n_attention_wv == 0 for Mamba models
13574
+ // - qs.n_attention_wv == model.hparams.n_layer for Transformer models
13575
+ //
13576
+ GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
12365
13577
 
12366
13578
  size_t total_size_org = 0;
12367
13579
  size_t total_size_new = 0;
@@ -12377,7 +13589,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12377
13589
 
12378
13590
  // populate the original tensors so we get an initial meta data
12379
13591
  for (int i = 0; i < ml.n_tensors; ++i) {
12380
- struct ggml_tensor * meta = ml.get_tensor_meta(i);
13592
+ const struct ggml_tensor * meta = ml.get_tensor_meta(i);
12381
13593
  gguf_add_tensor(ctx_out, meta);
12382
13594
  }
12383
13595
 
@@ -12391,6 +13603,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12391
13603
  // placeholder for the meta data
12392
13604
  ::zeros(fout, meta_size);
12393
13605
 
13606
+ const auto tn = LLM_TN(model.arch);
13607
+
12394
13608
  for (int i = 0; i < ml.n_tensors; ++i) {
12395
13609
  struct ggml_tensor * tensor = ml.get_tensor_meta(i);
12396
13610
 
@@ -12413,8 +13627,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12413
13627
  // This used to be a regex, but <regex> has an extreme cost to compile times.
12414
13628
  bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
12415
13629
 
12416
- // quantize only 2D tensors
12417
- quantize &= (ggml_n_dims(tensor) == 2);
13630
+ // quantize only 2D and 3D tensors (experts)
13631
+ quantize &= (ggml_n_dims(tensor) >= 2);
13632
+
13633
+ // do not quantize norm tensors
13634
+ quantize &= name.find("_norm.weight") == std::string::npos;
13635
+
12418
13636
  quantize &= params->quantize_output_tensor || name != "output.weight";
12419
13637
  quantize &= !params->only_copy;
12420
13638
 
@@ -12443,6 +13661,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12443
13661
  if (!params->pure && ggml_is_quantized(default_type)) {
12444
13662
  new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
12445
13663
  }
13664
+ if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
13665
+ new_type = params->token_embedding_type;
13666
+ }
13667
+ if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
13668
+ new_type = params->output_tensor_type;
13669
+ }
12446
13670
 
12447
13671
  // If we've decided to quantize to the same type the tensor is already
12448
13672
  // in then there's nothing to do.
@@ -12455,7 +13679,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12455
13679
  new_size = ggml_nbytes(tensor);
12456
13680
  LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
12457
13681
  } else {
12458
- const size_t nelements = ggml_nelements(tensor);
13682
+ const int64_t nelements = ggml_nelements(tensor);
12459
13683
 
12460
13684
  const float * imatrix = nullptr;
12461
13685
  if (imatrix_data) {
@@ -12463,11 +13687,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12463
13687
  if (it == imatrix_data->end()) {
12464
13688
  LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
12465
13689
  } else {
12466
- if (it->second.size() == (size_t)tensor->ne[0]) {
13690
+ if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
12467
13691
  imatrix = it->second.data();
12468
13692
  } else {
12469
13693
  LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
12470
- int(it->second.size()), int(tensor->ne[0]), tensor->name);
13694
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
13695
+
13696
+ // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
13697
+ // this is a significant error and it may be good idea to abort the process if this happens,
13698
+ // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
13699
+ // tok_embd should be ignored in this case, since it always causes this warning
13700
+ if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
13701
+ throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
13702
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
13703
+ }
12471
13704
  }
12472
13705
  }
12473
13706
  }
@@ -12475,6 +13708,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12475
13708
  new_type == GGML_TYPE_IQ2_XS ||
12476
13709
  new_type == GGML_TYPE_IQ2_S ||
12477
13710
  new_type == GGML_TYPE_IQ1_S ||
13711
+ (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
12478
13712
  (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
12479
13713
  LLAMA_LOG_ERROR("\n\n============================================================\n");
12480
13714
  LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
@@ -12497,21 +13731,30 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
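Note: the hunk below handles expert (MoE) tensors that are now stored as 3D {n_per_row, nrows, n_expert}: each expert slab is quantized separately with its own slice of the importance matrix, and sizes move to int64_t. A sketch of the slab offsets it computes (float counts, matching f32_data + i03*nelements_matrix and imatrix + i03*n_per_row above):

    #include <cstdint>

    struct slab_offsets { int64_t f32_off; int64_t imatrix_off; };

    static slab_offsets expert_offsets(int64_t i_expert, int64_t n_per_row, int64_t nrows) {
        return { i_expert*nrows*n_per_row, i_expert*n_per_row };
    }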
12497
13731
  LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
12498
13732
  fflush(stdout);
12499
13733
 
12500
- if (work.size() < nelements * 4) {
13734
+ if (work.size() < (size_t)nelements * 4) {
12501
13735
  work.resize(nelements * 4); // upper bound on size
12502
13736
  }
12503
13737
  new_data = work.data();
12504
13738
 
12505
- const int n_per_row = tensor->ne[0];
12506
- const int nrows = nelements / n_per_row;
13739
+ const int64_t n_per_row = tensor->ne[0];
13740
+ const int64_t nrows = tensor->ne[1];
13741
+
13742
+ static const int64_t min_chunk_size = 32 * 512;
13743
+ const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
12507
13744
 
12508
- static const int min_chunk_size = 32 * 512;
12509
- const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
13745
+ const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
13746
+ const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
13747
+ const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
12510
13748
 
12511
- const int nchunk = (nelements + chunk_size - 1)/chunk_size;
12512
- const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
12513
- new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix, workers, nthread_use);
13749
+ // quantize each expert separately since they have different importance matrices
13750
+ new_size = 0;
13751
+ for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
13752
+ const float * f32_data_03 = f32_data + i03 * nelements_matrix;
13753
+ void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
13754
+ const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
12514
13755
 
13756
+ new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
13757
+ }
12515
13758
  LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
12516
13759
  }
12517
13760
  total_size_org += ggml_nbytes(tensor);
@@ -12582,7 +13825,7 @@ static int llama_apply_lora_from_file_internal(
12582
13825
  if (path_base_model) {
12583
13826
  LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
12584
13827
  ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
12585
- ml->init_mapping(/*prefetch*/ false); // no prefetching
13828
+ ml->init_mappings(/*prefetch*/ false); // no prefetching
12586
13829
  }
12587
13830
 
12588
13831
  struct tensor_meta {
@@ -12703,7 +13946,7 @@ static int llama_apply_lora_from_file_internal(
12703
13946
 
12704
13947
  ggml_tensor * base_t;
12705
13948
  if (ml) {
12706
- if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) {
13949
+ if (!ml->get_tensor_meta(base_name.c_str())) {
12707
13950
  LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
12708
13951
  return 1;
12709
13952
  }
@@ -12887,11 +14130,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
12887
14130
  struct llama_model_quantize_params result = {
12888
14131
  /*.nthread =*/ 0,
12889
14132
  /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
14133
+ /*.output_tensor_type =*/ GGML_TYPE_COUNT,
14134
+ /*.token_embedding_type =*/ GGML_TYPE_COUNT,
12890
14135
  /*.allow_requantize =*/ false,
12891
14136
  /*.quantize_output_tensor =*/ true,
12892
14137
  /*.only_copy =*/ false,
12893
14138
  /*.pure =*/ false,
12894
14139
  /*.imatrix =*/ nullptr,
14140
+ /*.kv_overrides =*/ nullptr,
12895
14141
  };
12896
14142
 
12897
14143
  return result;
@@ -12900,7 +14146,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
12900
14146
  size_t llama_max_devices(void) {
12901
14147
  #if defined(GGML_USE_METAL)
12902
14148
  return 1;
12903
- #elif defined(GGML_USE_CUBLAS)
14149
+ #elif defined(GGML_USE_CUDA)
12904
14150
  return GGML_CUDA_MAX_DEVICES;
12905
14151
  #elif defined(GGML_USE_SYCL)
12906
14152
  return GGML_SYCL_MAX_DEVICES;
@@ -12920,8 +14166,8 @@ bool llama_supports_mlock(void) {
12920
14166
  }
12921
14167
 
12922
14168
  bool llama_supports_gpu_offload(void) {
12923
- #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
12924
- defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
14169
+ #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
14170
+ defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
12925
14171
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
12926
14172
  return true;
12927
14173
  #else
@@ -13028,7 +14274,7 @@ struct llama_context * llama_new_context_with_model(
13028
14274
  const auto & hparams = model->hparams;
13029
14275
  auto & cparams = ctx->cparams;
13030
14276
 
13031
- // TODO: maybe add n_seq_max here too
14277
+ cparams.n_seq_max = std::max(1u, params.n_seq_max);
13032
14278
  cparams.n_threads = params.n_threads;
13033
14279
  cparams.n_threads_batch = params.n_threads_batch;
13034
14280
  cparams.yarn_ext_factor = params.yarn_ext_factor;
@@ -13126,7 +14372,7 @@ struct llama_context * llama_new_context_with_model(
13126
14372
  }
13127
14373
  ctx->backends.push_back(ctx->backend_metal);
13128
14374
  }
13129
- #elif defined(GGML_USE_CUBLAS)
14375
+ #elif defined(GGML_USE_CUDA)
13130
14376
  if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
13131
14377
  // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
13132
14378
  ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
@@ -13149,7 +14395,20 @@ struct llama_context * llama_new_context_with_model(
13149
14395
  }
13150
14396
  }
13151
14397
  #elif defined(GGML_USE_VULKAN)
13152
- if (model->n_gpu_layers > 0) {
14398
+ if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
14399
+ LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
14400
+ llama_free(ctx);
14401
+ return nullptr;
14402
+ }
14403
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
14404
+ ggml_backend_t backend = ggml_backend_vk_init(0);
14405
+ if (backend == nullptr) {
14406
+ LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
14407
+ llama_free(ctx);
14408
+ return nullptr;
14409
+ }
14410
+ ctx->backends.push_back(backend);
14411
+ } else {
13153
14412
  for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
13154
14413
  ggml_backend_t backend = ggml_backend_vk_init(device);
13155
14414
  if (backend == nullptr) {
@@ -13161,30 +14420,28 @@ struct llama_context * llama_new_context_with_model(
13161
14420
  }
13162
14421
  }
13163
14422
  #elif defined(GGML_USE_SYCL)
13164
- if (model->n_gpu_layers > 0) {
13165
- // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
13166
- if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
13167
- ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
14423
+ // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
14424
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
14425
+ ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
14426
+ if (backend == nullptr) {
14427
+ int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
14428
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
14429
+ llama_free(ctx);
14430
+ return nullptr;
14431
+ }
14432
+ ctx->backends.push_back(backend);
14433
+ } else {
14434
+ // LLAMA_SPLIT_LAYER requires a backend for each GPU
14435
+ for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
14436
+ ggml_backend_t backend = ggml_backend_sycl_init(i);
13168
14437
  if (backend == nullptr) {
13169
- int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
13170
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
14438
+ int id_list[GGML_SYCL_MAX_DEVICES];
14439
+ ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
14440
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
13171
14441
  llama_free(ctx);
13172
14442
  return nullptr;
13173
14443
  }
13174
14444
  ctx->backends.push_back(backend);
13175
- } else {
13176
- // LLAMA_SPLIT_LAYER requires a backend for each GPU
13177
- for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
13178
- ggml_backend_t backend = ggml_backend_sycl_init(i);
13179
- if (backend == nullptr) {
13180
- int id_list[GGML_SYCL_MAX_DEVICES];
13181
- ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
13182
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
13183
- llama_free(ctx);
13184
- return nullptr;
13185
- }
13186
- ctx->backends.push_back(backend);
13187
- }
13188
14445
  }
13189
14446
  }
13190
14447
  #elif defined(GGML_USE_KOMPUTE)
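
The Vulkan path above now mirrors the other backends: LLAMA_SPLIT_MODE_ROW is rejected, LLAMA_SPLIT_MODE_NONE initializes a single device, and anything else enumerates every Vulkan device. A sketch of model params that stay within what this backend accepts, assuming a Vulkan build and a `model_path` supplied by the caller:

    // sketch: model params that the Vulkan path above accepts
    static llama_model * load_for_vulkan(const char * model_path) {
        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 99;                    // offload as many layers as fit
        mparams.split_mode   = LLAMA_SPLIT_MODE_NONE; // ROW would make context creation fail
        return llama_load_model_from_file(model_path, mparams);
    }
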
@@ -13232,25 +14489,12 @@ struct llama_context * llama_new_context_with_model(
13232
14489
 
13233
14490
  // graph outputs buffer
13234
14491
  {
13235
- // resized during inference, reserve maximum
13236
- ctx->logits_size = hparams.n_vocab*cparams.n_batch;
13237
- ctx->embd_size = params.embeddings ? hparams.n_embd*cparams.n_batch : 0;
13238
-
13239
- const size_t buf_output_size = (ctx->logits_size + ctx->embd_size)*sizeof(float);
13240
-
13241
- ctx->buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_output_size);
13242
- if (ctx->buf_output == nullptr) {
13243
- LLAMA_LOG_ERROR("%s: failed to allocate logits buffer\n", __func__);
14492
+ // resized during inference when a batch uses more outputs
14493
+ if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
14494
+ LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
13244
14495
  llama_free(ctx);
13245
14496
  return nullptr;
13246
14497
  }
13247
- ggml_backend_buffer_clear(ctx->buf_output, 0);
13248
-
13249
-
13250
- ctx->logits = (float *) ggml_backend_buffer_get_base(ctx->buf_output);
13251
- if (params.embeddings) {
13252
- ctx->embd = ctx->logits + ctx->logits_size;
13253
- }
13254
14498
 
13255
14499
  LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
13256
14500
  ggml_backend_buffer_name(ctx->buf_output),
@@ -13275,7 +14519,7 @@ struct llama_context * llama_new_context_with_model(
13275
14519
 
13276
14520
  // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
13277
14521
  bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
13278
- #ifndef GGML_USE_CUBLAS
14522
+ #ifndef GGML_USE_CUDA
13279
14523
  // pipeline parallelism requires support for async compute and events
13280
14524
  // currently this is only implemented in the CUDA backend
13281
14525
  pipeline_parallel = false;
@@ -13383,11 +14627,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
13383
14627
  case LLM_ARCH_ORION:
13384
14628
  case LLM_ARCH_INTERNLM2:
13385
14629
  case LLM_ARCH_MINICPM:
14630
+ case LLM_ARCH_XVERSE:
13386
14631
  case LLM_ARCH_COMMAND_R:
13387
14632
  return LLAMA_ROPE_TYPE_NORM;
13388
14633
 
13389
14634
  // the pairs of head values are offset by n_rot/2
13390
14635
  case LLM_ARCH_FALCON:
14636
+ case LLM_ARCH_GROK:
13391
14637
  case LLM_ARCH_PERSIMMON:
13392
14638
  case LLM_ARCH_BERT:
13393
14639
  case LLM_ARCH_NOMIC_BERT:
@@ -13763,30 +15009,60 @@ void llama_kv_cache_update(struct llama_context * ctx) {
13763
15009
  llama_kv_cache_update_internal(*ctx);
13764
15010
  }
13765
15011
 
15012
+ // deprecated
15013
+ size_t llama_get_state_size(const struct llama_context * ctx) {
15014
+ return llama_state_get_size(ctx);
15015
+ }
15016
+
15017
+ // deprecated
15018
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
15019
+ return llama_state_get_data(ctx, dst);
15020
+ }
15021
+
15022
+ // deprecated
15023
+ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
15024
+ return llama_state_set_data(ctx, src);
15025
+ }
15026
+
15027
+ // deprecated
15028
+ bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
15029
+ return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
15030
+ }
15031
+
15032
+ // deprecated
15033
+ bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
15034
+ return llama_state_save_file(ctx, path_session, tokens, n_token_count);
15035
+ }
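
The wrappers above keep the old session-file entry points alive while the implementation moves to the llama_state_* names introduced below. A sketch of the new spelling, assuming `lctx` is a valid context, `tokens` holds the prompt that was already evaluated, and the file name is a placeholder:

    // sketch: save and later restore the whole context with the renamed API
    static bool roundtrip_state(llama_context * lctx, const std::vector<llama_token> & tokens) {
        if (!llama_state_save_file(lctx, "session.bin", tokens.data(), tokens.size())) {
            return false;
        }
        std::vector<llama_token> restored(tokens.size());
        size_t n_restored = 0;
        return llama_state_load_file(lctx, "session.bin", restored.data(), restored.size(), &n_restored);
    }
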
13766
15036
 
13767
15037
  // Returns the *maximum* size of the state
13768
- size_t llama_get_state_size(const struct llama_context * ctx) {
15038
+ size_t llama_state_get_size(const struct llama_context * ctx) {
15039
+ const auto & cparams = ctx->cparams;
15040
+ const auto & hparams = ctx->model.hparams;
15041
+
13769
15042
  // we don't know the size of the rng until we actually serialize it, so reserve more than enough memory for its serialized state.
13770
15043
  // for reference, std::mt19937(1337) serializes to 6701 bytes.
13771
15044
  const size_t s_rng_size = sizeof(size_t);
13772
15045
  const size_t s_rng = LLAMA_MAX_RNG_STATE;
15046
+ const size_t s_n_outputs = sizeof(size_t);
15047
+ // assume worst case for outputs although only currently set ones are serialized
15048
+ const size_t s_output_pos = ctx->cparams.n_batch * sizeof(int32_t);
13773
15049
  const size_t s_logits_size = sizeof(size_t);
13774
- // assume worst case for logits although only currently set ones are serialized
13775
- const size_t s_logits = ctx->logits_size * sizeof(float);
15050
+ const size_t s_logits = ctx->logits_size ? cparams.n_batch * hparams.n_vocab * sizeof(float) : 0;
13776
15051
  const size_t s_embedding_size = sizeof(size_t);
13777
- const size_t s_embedding = ctx->embd_size * sizeof(float);
15052
+ const size_t s_embedding = ctx->embd_size ? cparams.n_batch * hparams.n_embd * sizeof(float) : 0;
13778
15053
  const size_t s_kv_buf_size = sizeof(size_t);
13779
15054
  const size_t s_kv_head = sizeof(uint32_t);
13780
15055
  const size_t s_kv_size = sizeof(uint32_t);
13781
15056
  const size_t s_kv_used = sizeof(uint32_t);
13782
15057
  const size_t s_kv = ctx->kv_self.total_size();
13783
- // TODO: assume the max is more than 1 seq_id per KV cell
13784
- const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
15058
+ const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
13785
15059
  const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
13786
15060
 
13787
15061
  const size_t s_total = (
13788
15062
  + s_rng_size
13789
15063
  + s_rng
15064
+ + s_n_outputs
15065
+ + s_output_pos
13790
15066
  + s_logits_size
13791
15067
  + s_logits
13792
15068
  + s_embedding_size
@@ -13847,21 +15123,21 @@ struct llama_data_file_context : llama_data_context {
13847
15123
  * file context:
13848
15124
  * llama_file file("/path", "wb");
13849
15125
  * llama_data_file_context data_ctx(&file);
13850
- * llama_copy_state_data(ctx, &data_ctx);
15126
+ * llama_state_get_data(ctx, &data_ctx);
13851
15127
  *
13852
15128
  * buffer context:
13853
15129
  * std::vector<uint8_t> buf(max_size, 0);
13854
15130
  * llama_data_buffer_context data_ctx(&buf.data());
13855
- * llama_copy_state_data(ctx, &data_ctx);
15131
+ * llama_state_get_data(ctx, &data_ctx);
13856
15132
  *
13857
15133
  */
13858
- static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
15134
+ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
13859
15135
  // copy rng
13860
15136
  {
13861
15137
  std::ostringstream rng_ss;
13862
15138
  rng_ss << ctx->rng;
13863
15139
 
13864
- const std::string & rng_str = rng_ss.str();
15140
+ const std::string & rng_str = rng_ss.str();
13865
15141
  const size_t rng_size = rng_str.size();
13866
15142
 
13867
15143
  GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
@@ -13870,25 +15146,61 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13870
15146
  data_ctx->write(rng_str.data(), rng_size);
13871
15147
  }
13872
15148
 
13873
- // copy logits
15149
+ // copy outputs
13874
15150
  {
13875
- const size_t logits_size = ctx->logits_size;
15151
+ // Can't use ctx->n_outputs because it's not for the
15152
+ // entire last batch when n_ubatch is smaller than n_batch
15153
+ size_t n_outputs = 0;
15154
+
15155
+ // copy output ids
15156
+ {
15157
+ std::vector<int32_t> output_pos;
13876
15158
 
13877
- data_ctx->write(&logits_size, sizeof(logits_size));
15159
+ const size_t n_batch = ctx->cparams.n_batch;
15160
+ const auto & output_ids = ctx->output_ids;
13878
15161
 
13879
- if (logits_size) {
13880
- data_ctx->write(ctx->logits, logits_size * sizeof(float));
15162
+ output_pos.resize(ctx->output_size);
15163
+
15164
+ // build a more compact representation of the output ids
15165
+ for (size_t i = 0; i < n_batch; ++i) {
15166
+ // map an output id to a position in the batch
15167
+ int32_t pos = output_ids[i];
15168
+ if (pos >= 0) {
15169
+ if ((size_t) pos >= n_outputs) {
15170
+ n_outputs = pos + 1;
15171
+ }
15172
+ GGML_ASSERT((size_t) pos < ctx->output_size);
15173
+ output_pos[pos] = i;
15174
+ }
15175
+ }
15176
+
15177
+ data_ctx->write(&n_outputs, sizeof(n_outputs));
15178
+
15179
+ if (n_outputs) {
15180
+ data_ctx->write(output_pos.data(), n_outputs * sizeof(int32_t));
15181
+ }
13881
15182
  }
13882
- }
13883
15183
 
13884
- // copy embeddings
13885
- {
13886
- const size_t embeddings_size = ctx->embd_size;
15184
+ // copy logits
15185
+ {
15186
+ const size_t logits_size = std::min(ctx->logits_size, n_outputs * ctx->model.hparams.n_vocab);
13887
15187
 
13888
- data_ctx->write(&embeddings_size, sizeof(embeddings_size));
15188
+ data_ctx->write(&logits_size, sizeof(logits_size));
13889
15189
 
13890
- if (embeddings_size) {
13891
- data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
15190
+ if (logits_size) {
15191
+ data_ctx->write(ctx->logits, logits_size * sizeof(float));
15192
+ }
15193
+ }
15194
+
15195
+ // copy embeddings
15196
+ {
15197
+ const size_t embeddings_size = std::min(ctx->embd_size, n_outputs * ctx->model.hparams.n_embd);
15198
+
15199
+ data_ctx->write(&embeddings_size, sizeof(embeddings_size));
15200
+
15201
+ if (embeddings_size) {
15202
+ data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
15203
+ }
13892
15204
  }
13893
15205
  }
13894
15206
 
@@ -13901,9 +15213,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13901
15213
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
13902
15214
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
13903
15215
 
13904
- const size_t kv_buf_size = kv_self.total_size();
15216
+ // NOTE: kv_size and kv_buf_size are mostly used for sanity checks
13905
15217
  const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
13906
15218
  const uint32_t kv_size = kv_self.size;
15219
+ const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
13907
15220
  const uint32_t kv_used = kv_self.used;
13908
15221
 
13909
15222
  data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
@@ -13912,6 +15225,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13912
15225
  data_ctx->write(&kv_used, sizeof(kv_used));
13913
15226
 
13914
15227
  if (kv_buf_size) {
15228
+ const size_t pre_kv_buf_size = data_ctx->get_size_written();
15229
+
13915
15230
  std::vector<uint8_t> tmp_buf;
13916
15231
  for (int il = 0; il < (int) n_layer; ++il) {
13917
15232
  const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
@@ -13941,6 +15256,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13941
15256
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
13942
15257
  }
13943
15258
  }
15259
+ GGML_ASSERT(kv_buf_size == data_ctx->get_size_written() - pre_kv_buf_size);
13944
15260
  }
13945
15261
 
13946
15262
  for (uint32_t i = 0; i < kv_head; ++i) {
@@ -13959,15 +15275,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13959
15275
  }
13960
15276
  }
13961
15277
 
13962
- size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
15278
+ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
13963
15279
  llama_data_buffer_context data_ctx(dst);
13964
- llama_copy_state_data_internal(ctx, &data_ctx);
15280
+ llama_state_get_data_internal(ctx, &data_ctx);
13965
15281
 
13966
15282
  return data_ctx.get_size_written();
13967
15283
  }
13968
15284
 
13969
15285
  // Sets the state reading from the specified source address
13970
- size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
15286
+ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
13971
15287
  const uint8_t * inp = src;
13972
15288
 
13973
15289
  // set rng
@@ -13985,6 +15301,28 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
13985
15301
  GGML_ASSERT(!rng_ss.fail());
13986
15302
  }
13987
15303
 
15304
+ // set output ids
15305
+ {
15306
+ size_t n_outputs;
15307
+ std::vector<int32_t> output_pos;
15308
+
15309
+ memcpy(&n_outputs, inp, sizeof(n_outputs)); inp += sizeof(n_outputs);
15310
+
15311
+ GGML_ASSERT(n_outputs <= llama_output_reserve(*ctx, n_outputs));
15312
+
15313
+ if (n_outputs) {
15314
+ output_pos.resize(n_outputs);
15315
+ memcpy(output_pos.data(), inp, n_outputs * sizeof(int32_t));
15316
+ inp += n_outputs * sizeof(int32_t);
15317
+
15318
+ for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
15319
+ int32_t id = output_pos[i];
15320
+ GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
15321
+ ctx->output_ids[id] = i;
15322
+ }
15323
+ }
15324
+ }
15325
+
13988
15326
  // set logits
13989
15327
  {
13990
15328
  size_t logits_size;
@@ -14005,7 +15343,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
14005
15343
 
14006
15344
  memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
14007
15345
 
14008
- GGML_ASSERT(ctx->embd_size == embeddings_size);
15346
+ GGML_ASSERT(ctx->embd_size >= embeddings_size);
14009
15347
 
14010
15348
  if (embeddings_size) {
14011
15349
  memcpy(ctx->embd, inp, embeddings_size * sizeof(float));
@@ -14032,8 +15370,18 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
14032
15370
  memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
14033
15371
  memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
14034
15372
 
15373
+ if (kv_self.size != kv_size) {
15374
+ // the KV cache needs to be big enough to load all the KV cells from the saved state
15375
+ GGML_ASSERT(kv_self.size >= kv_head);
15376
+
15377
+ LLAMA_LOG_INFO("%s: state contains %d KV cells, was saved with kv_size=%d, but is loaded with kv_size=%d (fine, but different)\n",
15378
+ __func__, kv_head, kv_size, kv_self.size);
15379
+ }
15380
+
14035
15381
  if (kv_buf_size) {
14036
- GGML_ASSERT(kv_self.total_size() == kv_buf_size);
15382
+ const size_t pre_kv_buf_size = inp - src;
15383
+
15384
+ GGML_ASSERT(kv_self.total_size() >= kv_buf_size);
14037
15385
 
14038
15386
  for (int il = 0; il < (int) n_layer; ++il) {
14039
15387
  const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
@@ -14053,23 +15401,21 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
14053
15401
 
14054
15402
  // v is not contiguous, copy row by row
14055
15403
  const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
14056
- const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
15404
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_self.size);
14057
15405
 
14058
15406
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
14059
15407
  ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
14060
15408
  inp += v_row_size;
14061
15409
  }
14062
15410
  }
15411
+ GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
14063
15412
  }
14064
15413
 
14065
- GGML_ASSERT(kv_self.size == kv_size);
15414
+ llama_kv_cache_clear(ctx);
14066
15415
 
14067
15416
  ctx->kv_self.head = kv_head;
14068
- ctx->kv_self.size = kv_size;
14069
15417
  ctx->kv_self.used = kv_used;
14070
15418
 
14071
- ctx->kv_self.cells.resize(kv_size);
14072
-
14073
15419
  for (uint32_t i = 0; i < kv_head; ++i) {
14074
15420
  llama_pos pos;
14075
15421
  size_t seq_id_size;
@@ -14086,22 +15432,17 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
14086
15432
  ctx->kv_self.cells[i].seq_id.insert(seq_id);
14087
15433
  }
14088
15434
  }
14089
-
14090
- for (uint32_t i = kv_head; i < kv_size; ++i) {
14091
- ctx->kv_self.cells[i].pos = -1;
14092
- ctx->kv_self.cells[i].seq_id.clear();
14093
- }
14094
15435
  }
14095
15436
 
14096
15437
  const size_t nread = inp - src;
14097
- const size_t max_size = llama_get_state_size(ctx);
15438
+ const size_t max_size = llama_state_get_size(ctx);
14098
15439
 
14099
15440
  GGML_ASSERT(nread <= max_size);
14100
15441
 
14101
15442
  return nread;
14102
15443
  }
14103
15444
 
14104
- static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
15445
+ static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
14105
15446
  llama_file file(path_session, "rb");
14106
15447
 
14107
15448
  // sanity checks
@@ -14139,7 +15480,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
14139
15480
  // restore the context state
14140
15481
  {
14141
15482
  const size_t n_state_size_cur = file.size - file.tell();
14142
- const size_t n_state_size_max = llama_get_state_size(ctx);
15483
+ const size_t n_state_size_max = llama_state_get_size(ctx);
14143
15484
 
14144
15485
  if (n_state_size_cur > n_state_size_max) {
14145
15486
  LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
@@ -14149,22 +15490,22 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
14149
15490
  std::vector<uint8_t> state_data(n_state_size_max);
14150
15491
  file.read_raw(state_data.data(), n_state_size_cur);
14151
15492
 
14152
- llama_set_state_data(ctx, state_data.data());
15493
+ llama_state_set_data(ctx, state_data.data());
14153
15494
  }
14154
15495
 
14155
15496
  return true;
14156
15497
  }
14157
15498
 
14158
- bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
15499
+ bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
14159
15500
  try {
14160
- return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
15501
+ return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
14161
15502
  } catch (const std::exception & err) {
14162
15503
  LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
14163
15504
  return false;
14164
15505
  }
14165
15506
  }
14166
15507
 
14167
- bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
15508
+ static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
14168
15509
  llama_file file(path_session, "wb");
14169
15510
 
14170
15511
  file.write_u32(LLAMA_SESSION_MAGIC);
@@ -14178,11 +15519,420 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
14178
15519
 
14179
15520
  // save the context state using stream saving
14180
15521
  llama_data_file_context data_ctx(&file);
14181
- llama_copy_state_data_internal(ctx, &data_ctx);
15522
+ llama_state_get_data_internal(ctx, &data_ctx);
14182
15523
 
14183
15524
  return true;
14184
15525
  }
14185
15526
 
15527
+ bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
15528
+ try {
15529
+ return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
15530
+ } catch (const std::exception & err) {
15531
+ LLAMA_LOG_ERROR("error saving session file: %s\n", err.what());
15532
+ return false;
15533
+ }
15534
+ }
15535
+
15536
+ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id) {
15537
+ // save the size of size_t as a uint32_t for safety check
15538
+ const size_t size_t_size_size = sizeof(uint32_t);
15539
+
15540
+ // other values
15541
+ const size_t s_cell_count_size = sizeof(uint32_t);
15542
+ const size_t s_layer_count_size = sizeof(uint32_t);
15543
+ const size_t n_embd_v_gqa_size = sizeof(uint32_t);
15544
+
15545
+ size_t s_cell_count = 0;
15546
+ size_t s_cell_data_size = 0;
15547
+ const auto & kv_self = ctx->kv_self;
15548
+ const auto & hparams = ctx->model.hparams;
15549
+
15550
+ const uint32_t n_layer = hparams.n_layer;
15551
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
15552
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
15553
+
15554
+ for (uint32_t i = 0; i < kv_self.size; ++i) {
15555
+ const auto & cell = kv_self.cells[i];
15556
+ if (cell.seq_id.count(seq_id) > 0) {
15557
+ ++s_cell_count;
15558
+ s_cell_data_size += sizeof(llama_pos);
15559
+ }
15560
+ }
15561
+
15562
+ for (int il = 0; il < (int)n_layer; ++il) {
15563
+ // types of keys and values
15564
+ s_cell_data_size += sizeof(int32_t) * 2;
15565
+ // k_size_row and v_size_el values of layer
15566
+ s_cell_data_size += sizeof(size_t) * 2;
15567
+
15568
+ // keys
15569
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
15570
+ s_cell_data_size += k_size_row * s_cell_count;
15571
+
15572
+ // values (transposed)
15573
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
15574
+ s_cell_data_size += v_size_el * s_cell_count * n_embd_v_gqa;
15575
+ }
15576
+
15577
+ const size_t s_total = (
15578
+ size_t_size_size +
15579
+ s_cell_count_size +
15580
+ s_layer_count_size +
15581
+ n_embd_v_gqa_size +
15582
+ s_cell_data_size
15583
+ );
15584
+
15585
+ return s_total;
15586
+ }
15587
+
15588
+ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
15589
+ const auto & kv_self = ctx->kv_self;
15590
+ GGML_ASSERT(!kv_self.recurrent); // not implemented
15591
+
15592
+ // Save the size of size_t as a uint32_t for safety check
15593
+ const uint32_t size_t_size = sizeof(size_t);
15594
+ data_ctx.write(&size_t_size, sizeof(size_t_size));
15595
+
15596
+ std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
15597
+ uint32_t cell_count = 0;
15598
+
15599
+ // Count the number of cells with the specified seq_id
15600
+ // Find all the ranges of cells with this seq id
15601
+ {
15602
+ uint32_t cell_range_begin = kv_self.size;
15603
+ for (uint32_t i = 0; i < kv_self.size; ++i) {
15604
+ const auto & cell = kv_self.cells[i];
15605
+ if (cell.has_seq_id(seq_id)) {
15606
+ ++cell_count;
15607
+ if (cell_range_begin == kv_self.size) {
15608
+ cell_range_begin = i;
15609
+ }
15610
+ }
15611
+ else {
15612
+ if (cell_range_begin != kv_self.size) {
15613
+ cell_ranges.push_back({ cell_range_begin, i });
15614
+ cell_range_begin = kv_self.size;
15615
+ }
15616
+ }
15617
+ }
15618
+ if (cell_range_begin != kv_self.size) {
15619
+ cell_ranges.push_back({ cell_range_begin, kv_self.size });
15620
+ }
15621
+
15622
+ // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
15623
+ uint32_t cell_count_check = 0;
15624
+ for (const auto & range : cell_ranges) {
15625
+ cell_count_check += range.second - range.first;
15626
+ }
15627
+ GGML_ASSERT(cell_count == cell_count_check);
15628
+ }
15629
+
15630
+ // Write the cell count
15631
+ data_ctx.write(&cell_count, sizeof(cell_count));
15632
+
15633
+ const auto & hparams = ctx->model.hparams;
15634
+ const uint32_t n_layer = hparams.n_layer;
15635
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
15636
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
15637
+
15638
+ // Write the layer count
15639
+ data_ctx.write(&n_layer, sizeof(n_layer));
15640
+
15641
+ // Write n_embd_v_gqa
15642
+ data_ctx.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
15643
+
15644
+ // Iterate the ranges and write all the pos (this is the token position in the prompt)
15645
+ for (const auto & range : cell_ranges) {
15646
+ for (uint32_t i = range.first; i < range.second; ++i) {
15647
+ const auto & cell = kv_self.cells[i];
15648
+ data_ctx.write(&cell.pos, sizeof(cell.pos));
15649
+ }
15650
+ }
15651
+
15652
+ // Iterate and write all the keys first, each row is a cell
15653
+ // Get whole range at a time
15654
+ std::vector<uint8_t> tmp_buf;
15655
+ for (int il = 0; il < (int)n_layer; ++il) {
15656
+ // Write key type
15657
+ const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
15658
+ data_ctx.write(&k_type_i, sizeof(k_type_i));
15659
+
15660
+ // Write row size of key
15661
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
15662
+ data_ctx.write(&k_size_row, sizeof(k_size_row));
15663
+
15664
+ // Read each range of cells of k_size length each into tmp_buf and write out
15665
+ for (const auto & range : cell_ranges) {
15666
+ const size_t range_size = range.second - range.first;
15667
+ tmp_buf.resize(range_size * k_size_row);
15668
+ ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
15669
+ data_ctx.write(tmp_buf.data(), tmp_buf.size());
15670
+ }
15671
+ }
15672
+
15673
+ // For the values, they are transposed, so we also need the element size and get the element ranges from each row
15674
+ const uint32_t kv_size = kv_self.size;
15675
+ for (int il = 0; il < (int)n_layer; ++il) {
15676
+ // Write value type
15677
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
15678
+ data_ctx.write(&v_type_i, sizeof(v_type_i));
15679
+
15680
+ // Write element size
15681
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
15682
+ data_ctx.write(&v_size_el, sizeof(v_size_el));
15683
+
15684
+ // For each row, we get the element values of each cell
15685
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
15686
+ // Read each range of cells of v_size_el length each into tmp_buf and write out
15687
+ for (const auto & range : cell_ranges) {
15688
+ const size_t range_size = range.second - range.first;
15689
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
15690
+ tmp_buf.resize(range_size * v_size_el);
15691
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
15692
+ data_ctx.write(tmp_buf.data(), tmp_buf.size());
15693
+ }
15694
+ }
15695
+ }
15696
+
15697
+ return data_ctx.get_size_written();
15698
+ }
15699
+
15700
+ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_seq_id seq_id) {
15701
+ llama_data_buffer_context data_ctx(dst);
15702
+ return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
15703
+ }
15704
+
15705
+ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
15706
+ auto & kv_self = ctx->kv_self;
15707
+ GGML_ASSERT(!kv_self.recurrent); // not implemented
15708
+
15709
+ // Wipe the slot
15710
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
15711
+
15712
+ const uint8_t * inp = src;
15713
+
15714
+ // Read size of size_t
15715
+ uint32_t size_t_size;
15716
+ memcpy(&size_t_size, inp, sizeof(size_t_size));
15717
+ inp += sizeof(size_t_size);
15718
+ if (size_t_size != sizeof(size_t)) {
15719
+ LLAMA_LOG_ERROR("%s: size_t size mismatch\n", __func__);
15720
+ return 0;
15721
+ }
15722
+
15723
+ // Read the cell count
15724
+ uint32_t cell_count;
15725
+ memcpy(&cell_count, inp, sizeof(cell_count));
15726
+ inp += sizeof(cell_count);
15727
+
15728
+ // Read the layer count
15729
+ uint32_t n_layer_ref;
15730
+ memcpy(&n_layer_ref, inp, sizeof(n_layer_ref));
15731
+ inp += sizeof(n_layer_ref);
15732
+
15733
+ // Read n_embd_v_gqa
15734
+ uint32_t n_embd_v_gqa_ref;
15735
+ memcpy(&n_embd_v_gqa_ref, inp, sizeof(n_embd_v_gqa_ref));
15736
+ inp += sizeof(n_embd_v_gqa_ref);
15737
+
15738
+ // Sanity check model compatibility
15739
+ const auto & hparams = ctx->model.hparams;
15740
+ const uint32_t n_layer = hparams.n_layer;
15741
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
15742
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
15743
+ if (n_layer != n_layer_ref) {
15744
+ LLAMA_LOG_ERROR("%s: mismatched n_layer (%d != %d)\n", __func__, n_layer, n_layer_ref);
15745
+ return 0;
15746
+ }
15747
+ if (n_embd_v_gqa != n_embd_v_gqa_ref) {
15748
+ LLAMA_LOG_ERROR("%s: mismatched n_embd_v_gqa (%d != %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref);
15749
+ return 0;
15750
+ }
15751
+
15752
+ // Allocate the new cells for the slot
15753
+ if (cell_count) {
15754
+ llama_batch batch = llama_batch_init(cell_count, 0, 1);
15755
+ batch.n_tokens = cell_count;
15756
+ for (uint32_t i = 0; i < cell_count; ++i) {
15757
+ llama_pos pos;
15758
+ memcpy(&pos, inp, sizeof(pos));
15759
+ inp += sizeof(pos);
15760
+
15761
+ batch.pos[i] = pos;
15762
+ batch.n_seq_id[i] = 1;
15763
+ batch.seq_id[i][0] = dest_seq_id;
15764
+ }
15765
+ if (!llama_kv_cache_find_slot(kv_self, batch)) {
15766
+ llama_batch_free(batch);
15767
+ LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
15768
+ return 0;
15769
+ }
15770
+
15771
+ // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
15772
+ // Assume that this is one contiguous block of cells
15773
+ GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
15774
+ GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
15775
+ GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
15776
+ GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
15777
+ GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
15778
+
15779
+ // Cleanup
15780
+ llama_batch_free(batch);
15781
+ }
15782
+
15783
+ const uint32_t kv_size = kv_self.size;
15784
+ const uint32_t kv_head = kv_self.head;
15785
+
15786
+ // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
15787
+ for (int il = 0; il < (int)n_layer; ++il) {
15788
+ // Read type of key
15789
+ int32_t k_type_i_ref;
15790
+ memcpy(&k_type_i_ref, inp, sizeof(k_type_i_ref));
15791
+ inp += sizeof(k_type_i_ref);
15792
+ const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
15793
+ if (k_type_i != k_type_i_ref) {
15794
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
15795
+ LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
15796
+ return 0;
15797
+ }
15798
+
15799
+ // Read row size of key
15800
+ size_t k_size_row_ref;
15801
+ memcpy(&k_size_row_ref, inp, sizeof(k_size_row_ref));
15802
+ inp += sizeof(k_size_row_ref);
15803
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
15804
+ if (k_size_row != k_size_row_ref) {
15805
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
15806
+ LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, k_size_row_ref, il);
15807
+ return 0;
15808
+ }
15809
+
15810
+ if (cell_count) {
15811
+ // Read and set the keys for the whole cell range
15812
+ ggml_backend_tensor_set(kv_self.k_l[il], inp, kv_head * k_size_row, cell_count * k_size_row);
15813
+ inp += cell_count * k_size_row;
15814
+ }
15815
+ }
15816
+
15817
+ // For each layer, read the values for each cell (transposed)
15818
+ for (int il = 0; il < (int)n_layer; ++il) {
15819
+ // Read type of value
15820
+ int32_t v_type_i_ref;
15821
+ memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
15822
+ inp += sizeof(v_type_i_ref);
15823
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
15824
+ if (v_type_i != v_type_i_ref) {
15825
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
15826
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
15827
+ return 0;
15828
+ }
15829
+
15830
+ // Read element size of value
15831
+ size_t v_size_el_ref;
15832
+ memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
15833
+ inp += sizeof(v_size_el_ref);
15834
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
15835
+ if (v_size_el != v_size_el_ref) {
15836
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
15837
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
15838
+ return 0;
15839
+ }
15840
+
15841
+ if (cell_count) {
15842
+ // For each row in the transposed matrix, read the values for the whole cell range
15843
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
15844
+ const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
15845
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
15846
+ inp += cell_count * v_size_el;
15847
+ }
15848
+ }
15849
+ }
15850
+
15851
+ const size_t nread = inp - src;
15852
+ return nread;
15853
+ }
15854
+
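
Taken together, the three functions above give a buffer-based round trip for a single sequence's KV cells. A sketch of copying one sequence from one context into another, assuming both contexts were created from the same model:

    // sketch: copy one sequence's KV cells between two contexts of the same model
    static bool copy_seq(llama_context * src_ctx, llama_context * dst_ctx, llama_seq_id seq) {
        std::vector<uint8_t> buf(llama_state_seq_get_size(src_ctx, seq));
        llama_state_seq_get_data(src_ctx, buf.data(), seq);
        // a return value of 0 means the restore was rejected (type/shape mismatch or no free cells)
        return llama_state_seq_set_data(dst_ctx, buf.data(), seq) != 0;
    }
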
15855
+ static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
15856
+ llama_file file(filepath, "wb");
15857
+
15858
+ file.write_u32(LLAMA_STATE_SEQ_MAGIC);
15859
+ file.write_u32(LLAMA_STATE_SEQ_VERSION);
15860
+
15861
+ // save the prompt
15862
+ file.write_u32((uint32_t)n_token_count);
15863
+ file.write_raw(tokens, sizeof(llama_token) * n_token_count);
15864
+
15865
+ // save the context state using stream saving
15866
+ llama_data_file_context data_ctx(&file);
15867
+ llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
15868
+
15869
+ const size_t res = file.tell();
15870
+ GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written());
15871
+ return res;
15872
+ }
15873
+
15874
+ static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
15875
+ llama_file file(filepath, "rb");
15876
+
15877
+ // version checks
15878
+ {
15879
+ const uint32_t magic = file.read_u32();
15880
+ const uint32_t version = file.read_u32();
15881
+
15882
+ if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
15883
+ LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
15884
+ return 0;
15885
+ }
15886
+ }
15887
+
15888
+ // load the prompt
15889
+ {
15890
+ const uint32_t n_token_count = file.read_u32();
15891
+
15892
+ if (n_token_count > n_token_capacity) {
15893
+ LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
15894
+ return 0;
15895
+ }
15896
+
15897
+ file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
15898
+ *n_token_count_out = n_token_count;
15899
+ }
15900
+
15901
+ // restore the context state
15902
+ {
15903
+ const size_t state_size = file.size - file.tell();
15904
+ std::vector<uint8_t> state_data(state_size);
15905
+ file.read_raw(state_data.data(), state_size);
15906
+ const size_t nread = llama_state_seq_set_data(ctx, state_data.data(), dest_seq_id);
15907
+ if (!nread) {
15908
+ LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
15909
+ return 0;
15910
+ }
15911
+ GGML_ASSERT(nread <= state_size);
15912
+ GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
15913
+ }
15914
+
15915
+ return file.tell();
15916
+ }
15917
+
15918
+ size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
15919
+ try {
15920
+ return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
15921
+ } catch (const std::exception & err) {
15922
+ LLAMA_LOG_ERROR("error saving sequence state file: %s\n", err.what());
15923
+ return 0;
15924
+ }
15925
+ }
15926
+
15927
+ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
15928
+ try {
15929
+ return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
15930
+ } catch (const std::exception & err) {
15931
+ LLAMA_LOG_ERROR("error loading sequence state file: %s\n", err.what());
15932
+ return 0;
15933
+ }
15934
+ }
15935
+
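
The file variants wrap the same data with a LLAMA_STATE_SEQ_MAGIC header and the prompt tokens. A sketch of persisting sequence 0 and restoring it later, assuming `lctx`, `tokens`, and the path come from the caller:

    // sketch: persist sequence 0 to disk and load it back into a (possibly different) context
    llama_state_seq_save_file(lctx, "seq0.bin", /*seq_id=*/ 0, tokens.data(), tokens.size());

    std::vector<llama_token> restored(tokens.size());
    size_t n_restored = 0;
    llama_state_seq_load_file(lctx, "seq0.bin", /*dest_seq_id=*/ 0,
                              restored.data(), restored.size(), &n_restored);
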
14186
15936
  void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
14187
15937
  ctx->cparams.n_threads = n_threads;
14188
15938
  ctx->cparams.n_threads_batch = n_threads_batch;
@@ -14296,11 +16046,41 @@ float * llama_get_logits(struct llama_context * ctx) {
14296
16046
  }
14297
16047
 
14298
16048
  float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
14299
- assert(ctx->logits_valid.at(i));
14300
-
16049
+ int32_t j = -1;
14301
16050
  llama_synchronize(ctx);
14302
16051
 
14303
- return ctx->logits + i*ctx->model.hparams.n_vocab;
16052
+ try {
16053
+ if (ctx->logits == nullptr) {
16054
+ throw std::runtime_error("no logits");
16055
+ }
16056
+
16057
+ if (i < 0) {
16058
+ j = ctx->n_outputs + i;
16059
+ if (j < 0) {
16060
+ throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
16061
+ }
16062
+ } else if ((size_t) i >= ctx->output_ids.size()) {
16063
+ throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
16064
+ } else {
16065
+ j = ctx->output_ids[i];
16066
+ }
16067
+
16068
+ if (j < 0) {
16069
+ throw std::runtime_error(format("batch.logits[%d] != true", i));
16070
+ }
16071
+ if (j >= ctx->n_outputs) {
16072
+ // This should not happen
16073
+ throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
16074
+ }
16075
+
16076
+ return ctx->logits + j*ctx->model.hparams.n_vocab;
16077
+ } catch (const std::exception & err) {
16078
+ LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
16079
+ #ifndef NDEBUG
16080
+ GGML_ASSERT(false);
16081
+ #endif
16082
+ return nullptr;
16083
+ }
14304
16084
  }
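
llama_get_logits_ith no longer asserts on a bad index in release builds; it logs the reason and returns nullptr, and a negative index now counts back from the last output. A sketch of the caller-side check this implies, assuming `lctx` and `batch` come from the surrounding decode loop:

    // sketch: -1 addresses the last output of the batch; nullptr now signals an invalid index
    float * logits = llama_get_logits_ith(lctx, -1);
    if (logits == nullptr) {
        // the token was not marked for output (batch.logits[i] == false) or the index was out of range
    }
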
14305
16085
 
14306
16086
  float * llama_get_embeddings(struct llama_context * ctx) {
@@ -14310,9 +16090,42 @@ float * llama_get_embeddings(struct llama_context * ctx) {
14310
16090
  }
14311
16091
 
14312
16092
  float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
16093
+ int32_t j = -1;
16094
+
14313
16095
  llama_synchronize(ctx);
14314
16096
 
14315
- return ctx->embd + i*ctx->model.hparams.n_embd;
16097
+ try {
16098
+ if (ctx->embd == nullptr) {
16099
+ throw std::runtime_error("no embeddings");
16100
+ }
16101
+
16102
+ if (i < 0) {
16103
+ j = ctx->n_outputs + i;
16104
+ if (j < 0) {
16105
+ throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
16106
+ }
16107
+ } else if ((size_t) i >= ctx->output_ids.size()) {
16108
+ throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
16109
+ } else {
16110
+ j = ctx->output_ids[i];
16111
+ }
16112
+
16113
+ if (j < 0) {
16114
+ throw std::runtime_error(format("batch.logits[%d] != true", i));
16115
+ }
16116
+ if (j >= ctx->n_outputs) {
16117
+ // This should not happen
16118
+ throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
16119
+ }
16120
+
16121
+ return ctx->embd + j*ctx->model.hparams.n_embd;
16122
+ } catch (const std::exception & err) {
16123
+ LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
16124
+ #ifndef NDEBUG
16125
+ GGML_ASSERT(false);
16126
+ #endif
16127
+ return nullptr;
16128
+ }
14316
16129
  }
14317
16130
 
14318
16131
  float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
@@ -14349,6 +16162,14 @@ llama_token llama_token_eos(const struct llama_model * model) {
14349
16162
  return model->vocab.special_eos_id;
14350
16163
  }
14351
16164
 
16165
+ llama_token llama_token_cls(const struct llama_model * model) {
16166
+ return model->vocab.special_cls_id;
16167
+ }
16168
+
16169
+ llama_token llama_token_sep(const struct llama_model * model) {
16170
+ return model->vocab.special_sep_id;
16171
+ }
16172
+
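
The two new accessors expose the CLS and SEP ids read from the corresponding tokenizer keys. A sketch of the BERT-style framing they are meant for, assuming `model` defines both tokens and `tokens` holds the already-tokenized text:

    // sketch: CLS <tokens> SEP framing for classifier/embedding models
    std::vector<llama_token> framed;
    framed.push_back(llama_token_cls(model));
    framed.insert(framed.end(), tokens.begin(), tokens.end());
    framed.push_back(llama_token_sep(model));
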
14352
16173
  llama_token llama_token_nl(const struct llama_model * model) {
14353
16174
  return model->vocab.linefeed_id;
14354
16175
  }
@@ -14383,9 +16204,9 @@ int32_t llama_tokenize(
14383
16204
  int32_t text_len,
14384
16205
  llama_token * tokens,
14385
16206
  int32_t n_tokens_max,
14386
- bool add_bos,
14387
- bool special) {
14388
- auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
16207
+ bool add_special,
16208
+ bool parse_special) {
16209
+ auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special);
14389
16210
 
14390
16211
  if (n_tokens_max < (int) res.size()) {
14391
16212
  // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
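
The last two arguments are renamed to match what they do: add_special controls whether the model's special tokens (such as BOS/EOS) are inserted, and parse_special controls whether special-token text inside the input is matched. A sketch of a call that adds the special tokens but treats the input as plain text, assuming `model`, `text`, and `text_len` come from the caller:

    // sketch: plain user text: add the model's special tokens, do not parse them out of the input
    std::vector<llama_token> out(text_len + 8);
    const int32_t n = llama_tokenize(model, text, text_len, out.data(), (int32_t) out.size(),
                                     /*add_special=*/ true, /*parse_special=*/ false);
    if (n >= 0) {
        out.resize(n);
    }
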
@@ -14602,6 +16423,55 @@ static int32_t llama_chat_apply_template_internal(
14602
16423
  ss << message->content << "</s>";
14603
16424
  }
14604
16425
  }
16426
+ } else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) {
16427
+ // openchat/openchat-3.5-0106,
16428
+ for (auto message : chat) {
16429
+ std::string role(message->role);
16430
+ if (role == "system") {
16431
+ ss << message->content << "<|end_of_turn|>";
16432
+ } else {
16433
+ role[0] = toupper(role[0]);
16434
+ ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
16435
+ }
16436
+ }
16437
+ if (add_ass) {
16438
+ ss << "GPT4 Correct Assistant:";
16439
+ }
16440
+ } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) {
16441
+ // eachadea/vicuna-13b-1.1 (and Orca variant)
16442
+ for (auto message : chat) {
16443
+ std::string role(message->role);
16444
+ if (role == "system") {
16445
+ // Orca-Vicuna variant uses a system prefix
16446
+ if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) {
16447
+ ss << "SYSTEM: " << message->content << "\n";
16448
+ } else {
16449
+ ss << message->content << "\n\n";
16450
+ }
16451
+ } else if (role == "user") {
16452
+ ss << "USER: " << message->content << "\n";
16453
+ } else if (role == "assistant") {
16454
+ ss << "ASSISTANT: " << message->content << "</s>\n";
16455
+ }
16456
+ }
16457
+ if (add_ass) {
16458
+ ss << "ASSISTANT:";
16459
+ }
16460
+ } else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) {
16461
+ // deepseek-ai/deepseek-coder-33b-instruct
16462
+ for (auto message : chat) {
16463
+ std::string role(message->role);
16464
+ if (role == "system") {
16465
+ ss << message->content;
16466
+ } else if (role == "user") {
16467
+ ss << "### Instruction:\n" << message->content << "\n";
16468
+ } else if (role == "assistant") {
16469
+ ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
16470
+ }
16471
+ }
16472
+ if (add_ass) {
16473
+ ss << "### Response:\n";
16474
+ }
14605
16475
  } else {
14606
16476
  // template not supported
14607
16477
  return -1;
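
The three new branches are reached either through the shorthand names ("openchat", "vicuna", "vicuna-orca", "deepseek") or by pattern-matching the model's embedded template string. A sketch of forcing one of them through the public entry point, assuming the template name is passed explicitly so the model pointer is not consulted and the message text is a placeholder:

    // sketch: render a two-message conversation with the vicuna template
    const llama_chat_message msgs[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!"                       },
    };
    std::vector<char> buf(4096);
    const int32_t n = llama_chat_apply_template(nullptr, "vicuna", msgs, 2,
                                                /*add_ass=*/ true, buf.data(), (int32_t) buf.size());
    // on success, n is the length written into buf, including the trailing "ASSISTANT:" prompt
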
@@ -14651,6 +16521,30 @@ LLAMA_API int32_t llama_chat_apply_template(
14651
16521
  return res;
14652
16522
  }
14653
16523
 
16524
+ LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
16525
+ static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
16526
+ if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
16527
+ return strlen(split_path);
16528
+ }
16529
+ return 0;
16530
+ }
16531
+
16532
+ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
16533
+ std::string str_split_path(split_path);
16534
+ char postfix[32];
16535
+ snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
16536
+ std::string str_postfix(postfix);
16537
+
16538
+ // check if dest ends with postfix
16539
+ int size_prefix = str_split_path.size() - str_postfix.size();
16540
+ if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
16541
+ snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
16542
+ return size_prefix;
16543
+ }
16544
+
16545
+ return 0;
16546
+ }
16547
+
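
llama_split_path builds the shard name used for split GGUF files, and llama_split_prefix recovers the prefix from such a name. A sketch, assuming the prefix is a placeholder:

    // sketch: shard names are 1-based on disk even though split_no is 0-based
    char split_path[1024];
    llama_split_path(split_path, sizeof(split_path), "ggml-model-q4_0", /*split_no=*/ 1, /*split_count=*/ 4);
    // split_path is now "ggml-model-q4_0-00002-of-00004.gguf"

    char prefix[1024];
    llama_split_prefix(prefix, sizeof(prefix), split_path, /*split_no=*/ 1, /*split_count=*/ 4);
    // prefix is now "ggml-model-q4_0"
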
14654
16548
  struct llama_timings llama_get_timings(struct llama_context * ctx) {
14655
16549
  struct llama_timings result = {
14656
16550
  /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,