llama_cpp 0.14.3 → 0.14.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,7 @@
7
7
  #include "ggml-alloc.h"
8
8
  #include "ggml-backend.h"
9
9
 
10
- #ifdef GGML_USE_CUBLAS
10
+ #ifdef GGML_USE_CUDA
11
11
  # include "ggml-cuda.h"
12
12
  #elif defined(GGML_USE_CLBLAST)
13
13
  # include "ggml-opencl.h"
@@ -52,12 +52,16 @@
52
52
  #define NOMINMAX
53
53
  #endif
54
54
  #include <windows.h>
55
+ #ifndef PATH_MAX
56
+ #define PATH_MAX MAX_PATH
57
+ #endif
55
58
  #include <io.h>
56
59
  #endif
57
60
 
58
61
  #include <algorithm>
59
62
  #include <array>
60
63
  #include <cassert>
64
+ #include <cctype>
61
65
  #include <cfloat>
62
66
  #include <cinttypes>
63
67
  #include <climits>
@@ -68,7 +72,6 @@
68
72
  #include <cstdio>
69
73
  #include <cstring>
70
74
  #include <ctime>
71
- #include <cwctype>
72
75
  #include <forward_list>
73
76
  #include <fstream>
74
77
  #include <functional>
@@ -192,6 +195,7 @@ enum llm_arch {
192
195
  LLM_ARCH_LLAMA,
193
196
  LLM_ARCH_FALCON,
194
197
  LLM_ARCH_BAICHUAN,
198
+ LLM_ARCH_GROK,
195
199
  LLM_ARCH_GPT2,
196
200
  LLM_ARCH_GPTJ,
197
201
  LLM_ARCH_GPTNEOX,
@@ -214,6 +218,7 @@ enum llm_arch {
214
218
  LLM_ARCH_GEMMA,
215
219
  LLM_ARCH_STARCODER2,
216
220
  LLM_ARCH_MAMBA,
221
+ LLM_ARCH_XVERSE,
217
222
  LLM_ARCH_COMMAND_R,
218
223
  LLM_ARCH_UNKNOWN,
219
224
  };
@@ -221,6 +226,7 @@ enum llm_arch {
221
226
  static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
222
227
  { LLM_ARCH_LLAMA, "llama" },
223
228
  { LLM_ARCH_FALCON, "falcon" },
229
+ { LLM_ARCH_GROK, "grok" },
224
230
  { LLM_ARCH_GPT2, "gpt2" },
225
231
  { LLM_ARCH_GPTJ, "gptj" },
226
232
  { LLM_ARCH_GPTNEOX, "gptneox" },
@@ -244,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
244
250
  { LLM_ARCH_GEMMA, "gemma" },
245
251
  { LLM_ARCH_STARCODER2, "starcoder2" },
246
252
  { LLM_ARCH_MAMBA, "mamba" },
253
+ { LLM_ARCH_XVERSE, "xverse" },
247
254
  { LLM_ARCH_COMMAND_R, "command-r" },
248
255
  { LLM_ARCH_UNKNOWN, "(unknown)" },
249
256
  };
@@ -290,6 +297,10 @@ enum llm_kv {
290
297
  LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
291
298
  LLM_KV_ROPE_SCALING_FINETUNED,
292
299
 
300
+ LLM_KV_SPLIT_NO,
301
+ LLM_KV_SPLIT_COUNT,
302
+ LLM_KV_SPLIT_TENSORS_COUNT,
303
+
293
304
  LLM_KV_SSM_INNER_SIZE,
294
305
  LLM_KV_SSM_CONV_KERNEL,
295
306
  LLM_KV_SSM_STATE_SIZE,
@@ -355,6 +366,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
355
366
  { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
356
367
  { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
357
368
 
369
+ { LLM_KV_SPLIT_NO, "split.no" },
370
+ { LLM_KV_SPLIT_COUNT, "split.count" },
371
+ { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
372
+
358
373
  { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
359
374
  { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
360
375
  { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
@@ -411,9 +426,12 @@ enum llm_tensor {
411
426
  LLM_TENSOR_FFN_DOWN,
412
427
  LLM_TENSOR_FFN_UP,
413
428
  LLM_TENSOR_FFN_ACT,
414
- LLM_TENSOR_FFN_DOWN_EXP,
429
+ LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
415
430
  LLM_TENSOR_FFN_GATE_EXP,
416
431
  LLM_TENSOR_FFN_UP_EXP,
432
+ LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
433
+ LLM_TENSOR_FFN_GATE_EXPS,
434
+ LLM_TENSOR_FFN_UP_EXPS,
417
435
  LLM_TENSOR_ATTN_Q_NORM,
418
436
  LLM_TENSOR_ATTN_K_NORM,
419
437
  LLM_TENSOR_LAYER_OUT_NORM,
@@ -448,6 +466,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
448
466
  { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
449
467
  { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
450
468
  { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
469
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
470
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
471
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
451
472
  },
452
473
  },
453
474
  {
@@ -483,6 +504,31 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
483
504
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
484
505
  },
485
506
  },
507
+ {
508
+ LLM_ARCH_GROK,
509
+ {
510
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
511
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
512
+ { LLM_TENSOR_OUTPUT, "output" },
513
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
514
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
515
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
516
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
517
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
518
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
519
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
520
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
521
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
522
+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
523
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
524
+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
525
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
526
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
527
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
528
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
529
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
530
+ },
531
+ },
486
532
  {
487
533
  LLM_ARCH_GPT2,
488
534
  {
@@ -548,6 +594,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
548
594
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
549
595
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
550
596
  { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
597
+ { LLM_TENSOR_POS_EMBD, "position_embd" },
598
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
599
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
551
600
  },
552
601
  },
553
602
  {
@@ -843,6 +892,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
843
892
  { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
844
893
  },
845
894
  },
895
+ {
896
+ LLM_ARCH_XVERSE,
897
+ {
898
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
899
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
900
+ { LLM_TENSOR_OUTPUT, "output" },
901
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
902
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
903
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
904
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
905
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
906
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
907
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
908
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
909
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
910
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
911
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
912
+ },
913
+ },
846
914
  {
847
915
  LLM_ARCH_COMMAND_R,
848
916
  {
@@ -1030,7 +1098,7 @@ struct llama_file {
1030
1098
  size_t size;
1031
1099
 
1032
1100
  llama_file(const char * fname, const char * mode) {
1033
- fp = std::fopen(fname, mode);
1101
+ fp = ggml_fopen(fname, mode);
1034
1102
  if (fp == NULL) {
1035
1103
  throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
1036
1104
  }
@@ -1099,6 +1167,7 @@ struct llama_file {
1099
1167
  }
1100
1168
  }
1101
1169
  };
1170
+ using llama_files = std::vector<std::unique_ptr<llama_file>>;
1102
1171
 
1103
1172
  struct llama_mmap {
1104
1173
  void * addr;
@@ -1299,6 +1368,7 @@ struct llama_mmap {
1299
1368
  }
1300
1369
  #endif
1301
1370
  };
1371
+ using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
1302
1372
 
1303
1373
  // Represents some region of memory being locked using mlock or VirtualLock;
1304
1374
  // will automatically unlock on destruction.
@@ -1448,6 +1518,7 @@ struct llama_mlock {
1448
1518
  static void raw_unlock(const void * addr, size_t len) {}
1449
1519
  #endif
1450
1520
  };
1521
+ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
1451
1522
 
1452
1523
  static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
1453
1524
  std::vector<char> result(8, 0);
@@ -1467,7 +1538,7 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
1467
1538
  static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
1468
1539
  ggml_backend_buffer_type_t buft = nullptr;
1469
1540
 
1470
- #if defined(GGML_USE_CUBLAS)
1541
+ #if defined(GGML_USE_CUDA)
1471
1542
  // host buffers should only be used when data is expected to be copied to/from the GPU
1472
1543
  if (host_buffer) {
1473
1544
  buft = ggml_backend_cuda_host_buffer_type();
@@ -1497,7 +1568,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
1497
1568
 
1498
1569
  #ifdef GGML_USE_METAL
1499
1570
  buft = ggml_backend_metal_buffer_type();
1500
- #elif defined(GGML_USE_CUBLAS)
1571
+ #elif defined(GGML_USE_CUDA)
1501
1572
  buft = ggml_backend_cuda_buffer_type(gpu);
1502
1573
  #elif defined(GGML_USE_VULKAN)
1503
1574
  buft = ggml_backend_vk_buffer_type(gpu);
@@ -1523,7 +1594,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
1523
1594
  static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
1524
1595
  ggml_backend_buffer_type_t buft = nullptr;
1525
1596
 
1526
- #ifdef GGML_USE_CUBLAS
1597
+ #ifdef GGML_USE_CUDA
1527
1598
  if (ggml_backend_cuda_get_device_count() > 1) {
1528
1599
  buft = ggml_backend_cuda_split_buffer_type(tensor_split);
1529
1600
  }
@@ -1544,7 +1615,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
1544
1615
  }
1545
1616
 
1546
1617
  static size_t llama_get_device_count() {
1547
- #if defined(GGML_USE_CUBLAS)
1618
+ #if defined(GGML_USE_CUDA)
1548
1619
  return ggml_backend_cuda_get_device_count();
1549
1620
  #elif defined(GGML_USE_SYCL)
1550
1621
  return ggml_backend_sycl_get_device_count();
@@ -1556,7 +1627,7 @@ static size_t llama_get_device_count() {
1556
1627
  }
1557
1628
 
1558
1629
  static size_t llama_get_device_memory(int device) {
1559
- #if defined(GGML_USE_CUBLAS)
1630
+ #if defined(GGML_USE_CUDA)
1560
1631
  size_t total;
1561
1632
  size_t free;
1562
1633
  ggml_backend_cuda_get_device_memory(device, &total, &free);
@@ -1621,6 +1692,7 @@ enum e_model {
1621
1692
  MODEL_40B,
1622
1693
  MODEL_65B,
1623
1694
  MODEL_70B,
1695
+ MODEL_314B,
1624
1696
  MODEL_SMALL,
1625
1697
  MODEL_MEDIUM,
1626
1698
  MODEL_LARGE,
@@ -1738,6 +1810,7 @@ struct llama_cparams {
1738
1810
  uint32_t n_ctx; // context size used during inference
1739
1811
  uint32_t n_batch;
1740
1812
  uint32_t n_ubatch;
1813
+ uint32_t n_seq_max;
1741
1814
  uint32_t n_threads; // number of threads to use for generation
1742
1815
  uint32_t n_threads_batch; // number of threads to use for batch processing
1743
1816
 
@@ -1803,9 +1876,9 @@ struct llama_layer {
1803
1876
 
1804
1877
  // ff MoE
1805
1878
  struct ggml_tensor * ffn_gate_inp;
1806
- struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
1807
- struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
1808
- struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
1879
+ struct ggml_tensor * ffn_gate_exps;
1880
+ struct ggml_tensor * ffn_down_exps;
1881
+ struct ggml_tensor * ffn_up_exps ;
1809
1882
 
1810
1883
  // ff bias
1811
1884
  struct ggml_tensor * ffn_down_b; // b2
@@ -2023,12 +2096,12 @@ struct llama_model {
2023
2096
  // the model memory buffers for the tensor data
2024
2097
  std::vector<ggml_backend_buffer_t> bufs;
2025
2098
 
2026
- // model memory mapped file
2027
- std::unique_ptr<llama_mmap> mapping;
2099
+ // model memory mapped files
2100
+ llama_mmaps mappings;
2028
2101
 
2029
2102
  // objects representing data potentially being locked in memory
2030
- std::vector<std::unique_ptr<llama_mlock>> mlock_bufs;
2031
- llama_mlock mlock_mmap;
2103
+ llama_mlocks mlock_bufs;
2104
+ llama_mlocks mlock_mmaps;
2032
2105
 
2033
2106
  // for quantize-stats only
2034
2107
  std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
@@ -2041,7 +2114,7 @@ struct llama_model {
2041
2114
  ggml_free(ctx);
2042
2115
  }
2043
2116
  for (ggml_backend_buffer_t buf : bufs) {
2044
- #ifdef GGML_USE_CUBLAS
2117
+ #ifdef GGML_USE_CUDA
2045
2118
  if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
2046
2119
  ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
2047
2120
  }
@@ -2060,10 +2133,6 @@ struct llama_context {
2060
2133
  ggml_backend_free(backend);
2061
2134
  }
2062
2135
 
2063
- #ifdef GGML_USE_VULKAN
2064
- ggml_vk_free_cpu_assist();
2065
- #endif
2066
-
2067
2136
  ggml_backend_buffer_free(buf_output);
2068
2137
  }
2069
2138
 
@@ -2100,20 +2169,20 @@ struct llama_context {
2100
2169
  // host buffer for the model output (logits and embeddings)
2101
2170
  ggml_backend_buffer_t buf_output = nullptr;
2102
2171
 
2103
- // decode output (2-dimensional array: [n_tokens][n_vocab])
2104
- size_t logits_size = 0;
2105
- float * logits = nullptr;
2172
+ // decode output (2-dimensional array: [n_outputs][n_vocab])
2173
+ size_t logits_size = 0; // capacity (of floats) for logits
2174
+ float * logits = nullptr;
2175
+
2176
+ std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
2177
+ size_t output_size = 0; // capacity (of tokens positions) for the output buffers
2178
+ int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch
2106
2179
 
2107
- #ifndef NDEBUG
2108
- // guard against access to unset logits
2109
- std::vector<bool> logits_valid;
2110
- #endif
2111
2180
  bool logits_all = false;
2112
2181
 
2113
- // embeddings output (2-dimensional array: [n_tokens][n_embd])
2182
+ // embeddings output (2-dimensional array: [n_outputs][n_embd])
2114
2183
  // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
2115
- size_t embd_size = 0;
2116
- float * embd = nullptr;
2184
+ size_t embd_size = 0; // capacity (of floats) for embeddings
2185
+ float * embd = nullptr;
2117
2186
 
2118
2187
  // sequence embeddings output (map of [n_embd] vectors)
2119
2188
  // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
@@ -2130,14 +2199,15 @@ struct llama_context {
2130
2199
  struct ggml_tensor * inp_tokens; // I32 [n_batch]
2131
2200
  struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
2132
2201
  struct ggml_tensor * inp_pos; // I32 [n_batch]
2202
+ struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
2133
2203
  struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
2134
- struct ggml_tensor * inp_KQ_pos; // F32 [kv_size]
2204
+ struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
2135
2205
  struct ggml_tensor * inp_K_shift; // I32 [kv_size]
2136
2206
  struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
2137
2207
  struct ggml_tensor * inp_cls; // I32 [n_batch]
2138
2208
  struct ggml_tensor * inp_s_copy; // I32 [kv_size]
2139
- struct ggml_tensor * inp_s_mask; // F32 [1, kv_size]
2140
- struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch]
2209
+ struct ggml_tensor * inp_s_mask; // F32 [1, n_kv]
2210
+ struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
2141
2211
 
2142
2212
  // control vectors
2143
2213
  struct llama_control_vector cvec;
@@ -2792,6 +2862,8 @@ namespace GGUFMeta {
2792
2862
  };
2793
2863
  }
2794
2864
 
2865
+ using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
2866
+
2795
2867
  struct llama_model_loader {
2796
2868
  int n_kv = 0;
2797
2869
  int n_tensors = 0;
@@ -2802,54 +2874,133 @@ struct llama_model_loader {
2802
2874
 
2803
2875
  bool use_mmap = false;
2804
2876
 
2805
- llama_file file;
2877
+ llama_files files;
2806
2878
  llama_ftype ftype;
2807
2879
  llama_fver fver;
2808
2880
 
2809
- std::unique_ptr<llama_mmap> mapping;
2881
+ llama_mmaps mappings;
2882
+
2883
+ // Holds information on a model weight
2884
+ struct llama_tensor_weight {
2885
+ uint16_t idx; // source file index
2886
+ size_t offs; // tensor data offset in the original file
2887
+
2888
+ ggml_tensor * tensor;
2889
+
2890
+ llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
2891
+ const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
2892
+ offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
2893
+ }
2894
+ };
2895
+ std::vector<llama_tensor_weight> weights;
2896
+
2810
2897
  std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
2811
2898
 
2812
- struct gguf_context * ctx_gguf = NULL;
2813
- struct ggml_context * ctx_meta = NULL;
2899
+ struct gguf_context * meta = NULL;
2900
+ std::vector<ggml_context *> contexts;
2814
2901
 
2815
2902
  std::string arch_name;
2816
2903
  LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
2817
2904
 
2818
- llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
2905
+ llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
2819
2906
  int trace = 0;
2820
2907
  if (getenv("LLAMA_TRACE")) {
2821
2908
  trace = atoi(getenv("LLAMA_TRACE"));
2822
2909
  }
2823
2910
 
2824
- struct gguf_init_params params = {
2825
- /*.no_alloc = */ true,
2826
- /*.ctx = */ &ctx_meta,
2827
- };
2828
-
2829
2911
  if (param_overrides_p != nullptr) {
2830
2912
  for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
2831
2913
  kv_overrides.insert({std::string(p->key), *p});
2832
2914
  }
2833
2915
  }
2834
2916
 
2835
- ctx_gguf = gguf_init_from_file(fname.c_str(), params);
2836
- if (!ctx_gguf) {
2917
+ struct ggml_context * ctx = NULL;
2918
+ struct gguf_init_params params = {
2919
+ /*.no_alloc = */ true,
2920
+ /*.ctx = */ &ctx,
2921
+ };
2922
+
2923
+ meta = gguf_init_from_file(fname.c_str(), params);
2924
+ if (!meta) {
2837
2925
  throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
2838
2926
  }
2839
2927
 
2840
2928
  get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
2841
2929
  llm_kv = LLM_KV(llm_arch_from_string(arch_name));
2842
2930
 
2843
- n_kv = gguf_get_n_kv(ctx_gguf);
2844
- n_tensors = gguf_get_n_tensors(ctx_gguf);
2931
+ // Save tensors data offset of the main file.
2932
+ // For subsidiary files, `meta` tensor data offset must not be used,
2933
+ // so we build a unified tensors index for weights.
2934
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
2935
+ weights.emplace_back(0, cur->name, meta, cur);
2936
+ }
2937
+ files.emplace_back(new llama_file(fname.c_str(), "rb"));
2938
+ contexts.emplace_back(ctx);
2939
+
2940
+ uint16_t n_split = 0;
2941
+ get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
2942
+
2943
+ // Load additional GGML contexts
2944
+ if (n_split > 1) {
2945
+ uint16_t idx = 0;
2946
+ get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
2947
+ if (idx != 0) {
2948
+ throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
2949
+ }
2950
+
2951
+ char split_prefix[PATH_MAX] = {0};
2952
+ if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) {
2953
+ throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
2954
+ }
2955
+
2956
+ if (trace > 0) {
2957
+ LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
2958
+ }
2959
+
2960
+ char split_path[PATH_MAX] = {0};
2961
+ for (idx = 1; idx < n_split; idx++) {
2962
+ llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
2963
+
2964
+ struct gguf_init_params split_params = {
2965
+ /*.no_alloc = */ true,
2966
+ /*.ctx = */ &ctx,
2967
+ };
2968
+ struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
2969
+ if (!ctx_gguf) {
2970
+ throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
2971
+ }
2972
+
2973
+ // Save tensors data offset info of the shard.
2974
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
2975
+ weights.emplace_back(idx, cur->name, ctx_gguf, cur);
2976
+ }
2977
+ files.emplace_back(new llama_file(split_path, "rb"));
2978
+ contexts.emplace_back(ctx);
2979
+
2980
+ gguf_free(ctx_gguf);
2981
+ }
2982
+
2983
+ get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
2984
+
2985
+ // sanity check
2986
+ {
2987
+ const int n_tensors_loaded = (int) weights.size();
2988
+ if (n_tensors != n_tensors_loaded) {
2989
+ throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
2990
+ }
2991
+ }
2992
+
2993
+ LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
2994
+ }
2995
+
2996
+ n_kv = gguf_get_n_kv(meta);
2997
+ n_tensors = weights.size();
2845
2998
 
2846
- fver = (enum llama_fver ) gguf_get_version(ctx_gguf);
2999
+ fver = (enum llama_fver) gguf_get_version(meta);
2847
3000
 
2848
- for (int i = 0; i < n_tensors; i++) {
2849
- const char * name = gguf_get_tensor_name(ctx_gguf, i);
2850
- struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
2851
- n_elements += ggml_nelements(t);
2852
- n_bytes += ggml_nbytes(t);
3001
+ for (auto & w : weights) {
3002
+ n_elements += ggml_nelements(w.tensor);
3003
+ n_bytes += ggml_nbytes(w.tensor);
2853
3004
  }
2854
3005
 
2855
3006
  LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -2864,7 +3015,8 @@ struct llama_model_loader {
2864
3015
  enum ggml_type type_max = GGML_TYPE_F32;
2865
3016
 
2866
3017
  for (int i = 0; i < n_tensors; i++) {
2867
- enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);
3018
+ const ggml_tensor * tensor = weights.at(i).tensor;
3019
+ enum ggml_type type = tensor->type;
2868
3020
 
2869
3021
  n_type[type]++;
2870
3022
 
@@ -2874,8 +3026,8 @@ struct llama_model_loader {
2874
3026
  }
2875
3027
 
2876
3028
  if (trace > 0) {
2877
- struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
2878
- LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
3029
+ const uint16_t sid = weights.at(i).idx;
3030
+ LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
2879
3031
  }
2880
3032
  }
2881
3033
 
@@ -2897,6 +3049,7 @@ struct llama_model_loader {
2897
3049
  case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
2898
3050
  case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
2899
3051
  case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
3052
+ case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break;
2900
3053
  case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
2901
3054
  case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
2902
3055
  case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
@@ -2911,22 +3064,23 @@ struct llama_model_loader {
2911
3064
  ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
2912
3065
 
2913
3066
  {
2914
- const int kid = gguf_find_key(ctx_gguf, "general.file_type");
3067
+ const int kid = gguf_find_key(meta, "general.file_type");
2915
3068
  if (kid >= 0) {
2916
- ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
3069
+ ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
2917
3070
  }
2918
3071
  }
2919
3072
 
2920
3073
  LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
3074
+
2921
3075
  for (int i = 0; i < n_kv; i++) {
2922
- const char * name = gguf_get_key(ctx_gguf, i);
2923
- const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
3076
+ const char * name = gguf_get_key(meta, i);
3077
+ const enum gguf_type type = gguf_get_kv_type(meta, i);
2924
3078
  const std::string type_name =
2925
3079
  type == GGUF_TYPE_ARRAY
2926
- ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
3080
+ ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
2927
3081
  : gguf_type_name(type);
2928
3082
 
2929
- std::string value = gguf_kv_to_str(ctx_gguf, i);
3083
+ std::string value = gguf_kv_to_str(meta, i);
2930
3084
  const size_t MAX_VALUE_LEN = 40;
2931
3085
  if (value.size() > MAX_VALUE_LEN) {
2932
3086
  value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
@@ -2955,18 +3109,18 @@ struct llama_model_loader {
2955
3109
  }
2956
3110
 
2957
3111
  ~llama_model_loader() {
2958
- if (ctx_gguf) {
2959
- gguf_free(ctx_gguf);
3112
+ if (meta) {
3113
+ gguf_free(meta);
2960
3114
  }
2961
- if (ctx_meta) {
2962
- ggml_free(ctx_meta);
3115
+ for (auto * ctx : contexts) {
3116
+ ggml_free(ctx);
2963
3117
  }
2964
3118
  }
2965
3119
 
2966
3120
  template<typename T>
2967
3121
  typename std::enable_if<std::is_integral<T>::value, bool>::type
2968
3122
  get_arr_n(const std::string & key, T & result, const bool required = true) {
2969
- const int kid = gguf_find_key(ctx_gguf, key.c_str());
3123
+ const int kid = gguf_find_key(meta, key.c_str());
2970
3124
 
2971
3125
  if (kid < 0) {
2972
3126
  if (required) {
@@ -2976,7 +3130,7 @@ struct llama_model_loader {
2976
3130
  }
2977
3131
 
2978
3132
  struct GGUFMeta::ArrayInfo arr_info =
2979
- GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);
3133
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
2980
3134
 
2981
3135
 
2982
3136
  result = arr_info.length;
@@ -2996,7 +3150,7 @@ struct llama_model_loader {
2996
3150
  const struct llama_model_kv_override * override =
2997
3151
  it != kv_overrides.end() ? &it->second : nullptr;
2998
3152
 
2999
- const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);
3153
+ const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
3000
3154
 
3001
3155
  if (required && !found) {
3002
3156
  throw std::runtime_error(format("key not found in model: %s", key.c_str()));
@@ -3019,28 +3173,57 @@ struct llama_model_loader {
3019
3173
  }
3020
3174
 
3021
3175
  const char * get_tensor_name(int i) const {
3022
- return gguf_get_tensor_name(ctx_gguf, i);
3176
+ return weights.at(i).tensor->name;
3177
+ }
3178
+
3179
+ const llama_tensor_weight * get_weight(const char * name) const {
3180
+ for (const auto & weight : weights) {
3181
+ if (strcmp(name, weight.tensor->name) == 0) {
3182
+ return &weight;
3183
+ }
3184
+ }
3185
+ return nullptr;
3186
+ }
3187
+
3188
+ const llama_tensor_weight & require_weight(const char * name) const {
3189
+ const llama_tensor_weight * weight = get_weight(name);
3190
+ if (!weight) {
3191
+ throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
3192
+ }
3193
+ return *weight;
3023
3194
  }
3024
3195
 
3025
3196
  struct ggml_tensor * get_tensor_meta(const char * name) const {
3026
- return ggml_get_tensor(ctx_meta, name);
3197
+ const auto * weight = get_weight(name);
3198
+ if (!weight) {
3199
+ return nullptr;
3200
+ }
3201
+ return weight->tensor;
3202
+ }
3203
+
3204
+ struct ggml_tensor * require_tensor_meta(const char * name) const {
3205
+ struct ggml_tensor * tensor = get_tensor_meta(name);
3206
+ if (!tensor) {
3207
+ throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
3208
+ }
3209
+ return tensor;
3027
3210
  }
3028
3211
 
3029
3212
  struct ggml_tensor * get_tensor_meta(int i) const {
3030
3213
  return get_tensor_meta(get_tensor_name(i));
3031
3214
  }
3032
3215
 
3033
- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) {
3034
- struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
3035
- ggml_set_name(tensor, ggml_get_name(meta));
3216
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
3217
+ struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
3218
+ ggml_set_name(tensor, ggml_get_name(cur));
3036
3219
 
3037
3220
  n_created++;
3038
3221
 
3039
3222
  return tensor;
3040
3223
  }
3041
3224
 
3042
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
3043
- struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
3225
+ const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
3226
+ const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
3044
3227
 
3045
3228
  if (cur == NULL) {
3046
3229
  if (!required) {
@@ -3051,8 +3234,8 @@ struct llama_model_loader {
3051
3234
 
3052
3235
  {
3053
3236
  bool is_ok = true;
3054
- for (size_t i = 0; i < ne.size(); ++i) {
3055
- if (ne[i] != cur->ne[i]) {
3237
+ for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
3238
+ if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
3056
3239
  is_ok = false;
3057
3240
  break;
3058
3241
  }
@@ -3066,127 +3249,196 @@ struct llama_model_loader {
3066
3249
  }
3067
3250
  }
3068
3251
 
3069
- return create_tensor_for(ctx, cur);
3252
+ return cur;
3070
3253
  }
3071
3254
 
3072
- void done_getting_tensors() const {
3073
- if (n_created != n_tensors) {
3074
- throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
3255
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
3256
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
3257
+
3258
+ if (cur == NULL) {
3259
+ return NULL;
3075
3260
  }
3261
+
3262
+ return create_tensor_for(ctx, cur);
3076
3263
  }
3077
3264
 
3078
- size_t file_offset(const char * name) const {
3079
- const int idx = gguf_find_tensor(ctx_gguf, name);
3265
+ struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
3266
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
3080
3267
 
3081
- if (idx < 0) {
3082
- throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name));
3268
+ if (cur == NULL) {
3269
+ return NULL;
3083
3270
  }
3084
3271
 
3085
- return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
3086
- }
3272
+ if (cur->type != base->type) {
3273
+ throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type)));
3274
+ }
3087
3275
 
3088
- void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) {
3089
- // prefetch the whole file - all the data is needed anyway
3090
- if (use_mmap) {
3091
- mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
3276
+ std::array<int64_t, GGML_MAX_DIMS> dims;
3277
+ for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
3278
+ dims[i] = i < ne.size() ? ne[i] : 1;
3092
3279
  }
3093
3280
 
3094
- // compute the total size of all tensors for progress reporting
3095
- for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
3096
- struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
3097
- size_data += ggml_nbytes(cur);
3281
+ struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
3282
+ dims[0], dims[1], dims[2], dims[3],
3283
+ cur->nb[1], cur->nb[2], cur->nb[3],
3284
+ offset);
3285
+
3286
+ ggml_set_name(tensor, name.c_str());
3287
+
3288
+ n_created++;
3289
+
3290
+ return tensor;
3291
+ }
3292
+
3293
+ void done_getting_tensors() const {
3294
+ if (n_created != n_tensors) {
3295
+ throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
3098
3296
  }
3297
+ }
3099
3298
 
3100
- if (use_mmap && mapping) {
3101
- if (lmlock) {
3102
- lmlock->init(mapping->addr);
3299
+ void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) {
3300
+ if (use_mmap) {
3301
+ mappings.reserve(files.size());
3302
+ mmaps_used.reserve(files.size());
3303
+ for (const auto & file : files) {
3304
+ std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
3305
+ mmaps_used.emplace_back(mapping->size, 0);
3306
+ if (mlock_mmaps) {
3307
+ std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
3308
+ mlock_mmap->init(mapping->addr);
3309
+ mlock_mmaps->emplace_back(std::move(mlock_mmap));
3310
+ }
3311
+ mappings.emplace_back(std::move(mapping));
3103
3312
  }
3104
- mmap_used_first = mapping->size;
3313
+ }
3314
+
3315
+ // compute the total size of all tensors for progress reporting
3316
+ for (auto & w : weights) {
3317
+ size_data += ggml_nbytes(w.tensor);
3105
3318
  }
3106
3319
  }
3107
3320
 
3108
- void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
3109
- GGML_ASSERT(mapping);
3321
+ void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
3322
+ GGML_ASSERT(!mappings.empty());
3323
+ const auto & mapping = mappings.at(idx);
3110
3324
 
3111
3325
  *first = mapping->size;
3112
3326
  *last = 0;
3327
+ *addr = mapping->addr;
3113
3328
  for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
3114
- const size_t offs = file_offset(ggml_get_name(tensor));
3115
- *first = std::min(*first, offs);
3116
- *last = std::max(*last, offs + ggml_nbytes(tensor));
3329
+ try {
3330
+ const auto * weight = get_weight(ggml_get_name(tensor));
3331
+ if (!weight) {
3332
+ continue;
3333
+ }
3334
+ if (weight->idx != idx) {
3335
+ continue;
3336
+ }
3337
+ *first = std::min(*first, weight->offs);
3338
+ *last = std::max(*last, weight->offs + ggml_nbytes(tensor));
3339
+ } catch(...) {
3340
+ // the tensor is not in the model
3341
+ }
3117
3342
  }
3118
3343
  }
3119
3344
 
3120
3345
  // for backwards compatibility, does not support ggml-backend
3121
3346
  void load_data_for(struct ggml_tensor * cur) const {
3122
- const size_t offs = file_offset(ggml_get_name(cur));
3347
+ const auto & w = require_weight(ggml_get_name(cur));
3123
3348
 
3124
- if (use_mmap && mapping) {
3349
+ if (use_mmap) {
3350
+ const auto & mapping = mappings.at(w.idx);
3125
3351
  if (cur->data == nullptr) {
3126
- cur->data = (uint8_t *)mapping->addr + offs;
3352
+ cur->data = (uint8_t *)mapping->addr + w.offs;
3127
3353
  } else {
3128
- memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
3354
+ memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur));
3129
3355
  }
3130
3356
  } else {
3131
3357
  GGML_ASSERT(cur->data != nullptr);
3132
- file.seek(offs, SEEK_SET);
3133
- file.read_raw(cur->data, ggml_nbytes(cur));
3358
+ GGML_ASSERT(w.idx < files.size());
3359
+ const auto & file = files.at(w.idx);
3360
+ file->seek(w.offs, SEEK_SET);
3361
+ file->read_raw(cur->data, ggml_nbytes(cur));
3134
3362
  }
3135
3363
  }
3136
3364
 
3137
3365
  size_t size_done = 0;
3138
3366
  size_t size_data = 0;
3139
- size_t mmap_used_first = -1;
3140
- size_t mmap_used_last = 0;
3367
+ std::vector<std::pair<size_t, size_t>> mmaps_used;
3141
3368
 
3142
3369
  // Returns false if cancelled by progress_callback
3143
- bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) {
3144
- GGML_ASSERT(size_data != 0 && "call init_mapping() first");
3370
+ bool load_all_data(
3371
+ struct ggml_context * ctx,
3372
+ llama_buf_map & bufs_mmap,
3373
+ llama_mlocks * lmlocks,
3374
+ llama_progress_callback progress_callback,
3375
+ void * progress_callback_user_data) {
3376
+ GGML_ASSERT(size_data != 0 && "call init_mappings() first");
3145
3377
 
3146
3378
  std::vector<no_init<uint8_t>> read_buf;
3147
-
3148
3379
  for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
3380
+ const auto * weight = get_weight(ggml_get_name(cur));
3381
+ if (weight == nullptr) {
3382
+ // this can happen with split experts models
3383
+ continue;
3384
+ }
3385
+
3149
3386
  if (progress_callback) {
3150
3387
  if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
3151
3388
  return false;
3152
3389
  }
3153
3390
  }
3154
3391
 
3155
- const size_t offs = file_offset(ggml_get_name(cur));
3392
+ size_t n_size = ggml_nbytes(cur);
3156
3393
 
3157
- if (use_mmap && mapping) {
3394
+ if (use_mmap) {
3395
+ const auto & mapping = mappings.at(weight->idx);
3396
+ ggml_backend_buffer_t buf_mmap = nullptr;
3397
+ if (bufs_mmap.count(weight->idx)) {
3398
+ buf_mmap = bufs_mmap.at(weight->idx);
3399
+ }
3400
+ GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
3158
3401
  if (buf_mmap && cur->data == nullptr) {
3159
- ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
3160
- if (lmlock) {
3161
- lmlock->grow_to(offs + ggml_nbytes(cur));
3402
+ ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
3403
+ if (lmlocks) {
3404
+ const auto & lmlock = lmlocks->at(weight->idx);
3405
+ lmlock->grow_to(weight->offs + ggml_nbytes(cur));
3162
3406
  }
3163
- mmap_used_first = std::min(mmap_used_first, offs);
3164
- mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur));
3407
+
3408
+ auto & mmap_used = mmaps_used[weight->idx];
3409
+ mmap_used.first = std::min(mmap_used.first, weight->offs);
3410
+ mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
3165
3411
  } else {
3166
- ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
3412
+ ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
3167
3413
  }
3168
3414
  } else {
3415
+ GGML_ASSERT(weight->idx < files.size());
3416
+ const auto & file = files.at(weight->idx);
3169
3417
  if (ggml_backend_buffer_is_host(cur->buffer)) {
3170
- file.seek(offs, SEEK_SET);
3171
- file.read_raw(cur->data, ggml_nbytes(cur));
3418
+ file->seek(weight->offs, SEEK_SET);
3419
+ file->read_raw(cur->data, ggml_nbytes(cur));
3172
3420
  } else {
3173
3421
  read_buf.resize(ggml_nbytes(cur));
3174
- file.seek(offs, SEEK_SET);
3175
- file.read_raw(read_buf.data(), ggml_nbytes(cur));
3176
- ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
3422
+ file->seek(weight->offs, SEEK_SET);
3423
+ file->read_raw(read_buf.data(), ggml_nbytes(cur));
3424
+ ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
3177
3425
  }
3178
3426
  }
3179
3427
 
3180
- size_done += ggml_nbytes(cur);
3428
+ size_done += n_size;
3181
3429
  }
3182
3430
 
3183
3431
  // check if this is the last call and do final cleanup
3184
3432
  if (size_done >= size_data) {
3185
3433
  // unmap offloaded tensors and metadata
3186
- if (use_mmap && mapping) {
3187
- mapping->unmap_fragment(0, mmap_used_first);
3188
- if (mmap_used_last != 0) {
3189
- mapping->unmap_fragment(mmap_used_last, mapping->size);
3434
+ if (use_mmap) {
3435
+ for (uint32_t idx = 0; idx < mappings.size(); idx++) {
3436
+ const auto & mmap_used = mmaps_used.at(idx);
3437
+ auto & mapping = mappings.at(idx);
3438
+ mapping->unmap_fragment(0, mmap_used.first);
3439
+ if (mmap_used.second != 0) {
3440
+ mapping->unmap_fragment(mmap_used.second, mapping->size);
3441
+ }
3190
3442
  }
3191
3443
  }
3192
3444
  if (progress_callback) {
@@ -3259,6 +3511,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
3259
3511
  case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
3260
3512
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
3261
3513
  case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
3514
+ case LLAMA_FTYPE_MOSTLY_IQ1_M :return "IQ1_M - 1.75 bpw";
3262
3515
  case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
3263
3516
  case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
3264
3517
  case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
@@ -3290,6 +3543,7 @@ static const char * llama_model_type_name(e_model type) {
3290
3543
  case MODEL_40B: return "40B";
3291
3544
  case MODEL_65B: return "65B";
3292
3545
  case MODEL_70B: return "70B";
3546
+ case MODEL_314B: return "314B";
3293
3547
  case MODEL_SMALL: return "0.1B";
3294
3548
  case MODEL_MEDIUM: return "0.4B";
3295
3549
  case MODEL_LARGE: return "0.8B";
@@ -3319,7 +3573,7 @@ static void llm_load_hparams(
3319
3573
  llama_model_loader & ml,
3320
3574
  llama_model & model) {
3321
3575
  auto & hparams = model.hparams;
3322
- const gguf_context * ctx = ml.ctx_gguf;
3576
+ const gguf_context * ctx = ml.meta;
3323
3577
 
3324
3578
  // get metadata as string
3325
3579
  for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -3428,6 +3682,15 @@ static void llm_load_hparams(
3428
3682
  default: model.type = e_model::MODEL_UNKNOWN;
3429
3683
  }
3430
3684
  } break;
3685
+ case LLM_ARCH_GROK:
3686
+ {
3687
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
3688
+
3689
+ switch (hparams.n_layer) {
3690
+ case 64: model.type = e_model::MODEL_314B; break;
3691
+ default: model.type = e_model::MODEL_UNKNOWN;
3692
+ }
3693
+ } break;
3431
3694
  case LLM_ARCH_FALCON:
3432
3695
  {
3433
3696
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3679,6 +3942,16 @@ static void llm_load_hparams(
3679
3942
  default: model.type = e_model::MODEL_UNKNOWN;
3680
3943
  }
3681
3944
  } break;
3945
+ case LLM_ARCH_XVERSE:
3946
+ {
3947
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
3948
+ switch (hparams.n_layer) {
3949
+ case 32: model.type = e_model::MODEL_7B; break;
3950
+ case 40: model.type = e_model::MODEL_13B; break;
3951
+ case 80: model.type = e_model::MODEL_65B; break;
3952
+ default: model.type = e_model::MODEL_UNKNOWN;
3953
+ }
3954
+ } break;
3682
3955
  case LLM_ARCH_COMMAND_R:
3683
3956
  {
3684
3957
  ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -3709,7 +3982,7 @@ static void llm_load_vocab(
3709
3982
  llama_model & model) {
3710
3983
  auto & vocab = model.vocab;
3711
3984
 
3712
- struct gguf_context * ctx = ml.ctx_gguf;
3985
+ struct gguf_context * ctx = ml.meta;
3713
3986
 
3714
3987
  const auto kv = LLM_KV(model.arch);
3715
3988
 
@@ -3842,7 +4115,7 @@ static void llm_load_vocab(
3842
4115
  } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
3843
4116
  vocab.linefeed_id = vocab.special_pad_id;
3844
4117
  } else {
3845
- const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
4118
+ const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
3846
4119
  GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
3847
4120
  vocab.linefeed_id = ids[0];
3848
4121
  }
@@ -4075,6 +4348,7 @@ static bool llm_load_tensors(
4075
4348
 
4076
4349
  const int64_t n_layer = hparams.n_layer;
4077
4350
  const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
4351
+ bool use_mmap_buffer = true;
4078
4352
 
4079
4353
  // there is very little benefit to offloading the input layer, so always keep it on the CPU
4080
4354
  model.buft_input = llama_default_buffer_type_cpu(true);
@@ -4163,6 +4437,10 @@ static bool llm_load_tensors(
4163
4437
 
4164
4438
  // create one context per buffer type
4165
4439
  size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
4440
+
4441
+ // for moe merged tensors
4442
+ ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
4443
+
4166
4444
  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
4167
4445
  for (auto & it : buft_layer_count) {
4168
4446
  struct ggml_init_params params = {
@@ -4189,6 +4467,11 @@ static bool llm_load_tensors(
4189
4467
  const int64_t n_vocab = hparams.n_vocab;
4190
4468
  const int64_t n_vocab_type = hparams.n_vocab_type;
4191
4469
  const int64_t n_ff = hparams.n_ff;
4470
+ const int64_t n_expert = hparams.n_expert;
4471
+
4472
+ if (n_expert > 0 && hparams.n_expert_used == 0) {
4473
+ throw std::runtime_error("model has expert layers but no expert layers are used");
4474
+ }
4192
4475
 
4193
4476
  GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
4194
4477
 
@@ -4243,26 +4526,113 @@ static bool llm_load_tensors(
4243
4526
 
4244
4527
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4245
4528
 
4246
- layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
4247
-
4248
- if (layer.ffn_gate_inp == nullptr) {
4249
- GGML_ASSERT(hparams.n_expert == 0);
4250
- GGML_ASSERT(hparams.n_expert_used == 0);
4251
-
4529
+ if (n_expert == 0) {
4252
4530
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
4253
4531
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4254
4532
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4255
4533
  } else {
4256
- GGML_ASSERT(hparams.n_expert > 0);
4257
- GGML_ASSERT(hparams.n_expert_used > 0);
4258
-
4259
- // MoE branch
4260
- for (uint32_t x = 0; x < hparams.n_expert; ++x) {
4261
- layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff});
4262
- layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd});
4263
- layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff});
4534
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
4535
+
4536
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
4537
+ if (layer.ffn_gate_exps) {
4538
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
4539
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
4540
+ } else {
4541
+ // merge split expert into a single tensor for compatibility with older models
4542
+ // requires disabling mmap
4543
+ use_mmap_buffer = false;
4544
+
4545
+ ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
4546
+ ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
4547
+ ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
4548
+
4549
+ layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
4550
+ layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
4551
+ layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
4552
+
4553
+ ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
4554
+ ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
4555
+ ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
4556
+
4557
+ for (uint32_t x = 0; x < n_expert; ++x) {
4558
+ // the individual experts are loaded into a view of the merged tensor
4559
+ ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
4560
+ ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
4561
+ ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
4562
+ }
4563
+ }
4564
+ }
4565
+ }
4566
+ } break;
4567
+ case LLM_ARCH_GROK:
4568
+ {
4569
+ if (n_expert == 0) {
4570
+ throw std::runtime_error("Grok model cannot have zero experts");
4571
+ }
4572
+
4573
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4574
+
4575
+ // output
4576
+ {
4577
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4578
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
4579
+ // if output is NULL, init from the input tok embed
4580
+ if (model.output == NULL) {
4581
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4582
+ ml.n_created--; // artificial tensor
4583
+ ml.size_data += ggml_nbytes(model.output);
4584
+ }
4585
+ }
4586
+
4587
+ for (int i = 0; i < n_layer; ++i) {
4588
+ ggml_context * ctx_layer = ctx_for_layer(i);
4589
+ ggml_context * ctx_split = ctx_for_layer_split(i);
4590
+
4591
+ auto & layer = model.layers[i];
4592
+
4593
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4594
+
4595
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
4596
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
4597
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
4598
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
4599
+
4600
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
4601
+
4602
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4603
+
4604
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
4605
+
4606
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
4607
+ if (layer.ffn_gate_exps) {
4608
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
4609
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
4610
+ } else {
4611
+ // merge split expert into a single tensor for compatibility with older models
4612
+ // requires disabling mmap
4613
+ use_mmap_buffer = false;
4614
+
4615
+ ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
4616
+ ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
4617
+ ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
4618
+
4619
+ layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
4620
+ layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
4621
+ layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
4622
+
4623
+ ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
4624
+ ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
4625
+ ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
4626
+
4627
+ for (uint32_t x = 0; x < n_expert; ++x) {
4628
+ // the individual experts are loaded into a view of the merged tensor
4629
+ ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
4630
+ ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
4631
+ ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
4264
4632
  }
4265
4633
  }
4634
+
4635
+ layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
4266
4636
  }
4267
4637
  } break;
4268
4638
  case LLM_ARCH_BAICHUAN:
@@ -4319,10 +4689,8 @@ static bool llm_load_tensors(
4319
4689
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4320
4690
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
4321
4691
 
4322
- if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
4323
- layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
4324
- layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
4325
- }
4692
+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
4693
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);
4326
4694
 
4327
4695
  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
4328
4696
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
@@ -4502,6 +4870,7 @@ static bool llm_load_tensors(
4502
4870
  case LLM_ARCH_MPT:
4503
4871
  {
4504
4872
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4873
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);
4505
4874
 
4506
4875
  // output
4507
4876
  {
@@ -4540,6 +4909,12 @@ static bool llm_load_tensors(
4540
4909
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4541
4910
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
4542
4911
 
4912
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
4913
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
4914
+
4915
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
4916
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
4917
+
4543
4918
  // AWQ ScaleActivation layer
4544
4919
  layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
4545
4920
  }
@@ -4986,6 +5361,28 @@ static bool llm_load_tensors(
4986
5361
  layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
4987
5362
  }
4988
5363
  } break;
5364
+ case LLM_ARCH_XVERSE:
5365
+ {
5366
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5367
+ {
5368
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5369
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
5370
+ }
5371
+ for (int i = 0; i < n_layer; ++i) {
5372
+ ggml_context * ctx_layer = ctx_for_layer(i);
5373
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5374
+ auto & layer = model.layers[i];
5375
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5376
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5377
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5378
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5379
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5380
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
5381
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5382
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
5383
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5384
+ }
5385
+ } break;
4989
5386
  case LLM_ARCH_COMMAND_R:
4990
5387
  {
4991
5388
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5024,56 +5421,97 @@ static bool llm_load_tensors(
5024
5421
 
5025
5422
  ml.done_getting_tensors();
5026
5423
 
5027
- ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr);
5424
+ ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
5425
+ model.mappings.reserve(ml.mappings.size());
5028
5426
 
5029
5427
  // create the backend buffers
5030
- std::vector<std::pair<ggml_context *, ggml_backend_buffer_t>> ctx_bufs;
5428
+ std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
5429
+ ctx_bufs.reserve(ctx_map.size());
5430
+
5431
+ // Ensure we have enough capacity for the maximum number of backend buffers we will potentially create
5432
+ size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
5433
+ model.bufs.reserve(n_max_backend_buffer);
5031
5434
 
5032
5435
  for (auto & it : ctx_map) {
5033
5436
  ggml_backend_buffer_type_t buft = it.first;
5034
- ggml_context * ctx = it.second;
5035
- ggml_backend_buffer_t buf = nullptr;
5437
+ ggml_context * ctx = it.second;
5438
+
5439
+ llama_buf_map bufs;
5440
+ bufs.reserve(n_max_backend_buffer);
5036
5441
 
5037
5442
  // only the mmap region containing the tensors in the model is mapped to the backend buffer
5038
5443
  // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
5039
5444
  // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
5040
- if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
5041
- size_t first, last;
5042
- ml.get_mapping_range(&first, &last, ctx);
5043
- buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
5044
- #ifdef GGML_USE_CUBLAS
5045
- if (n_layer >= n_gpu_layers) {
5046
- ggml_backend_cuda_register_host_buffer(
5445
+ if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
5446
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
5447
+ void * addr = nullptr;
5448
+ size_t first, last;
5449
+ ml.get_mapping_range(&first, &last, &addr, idx, ctx);
5450
+ if (first >= last) {
5451
+ continue;
5452
+ }
5453
+ ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
5454
+ if (buf == nullptr) {
5455
+ throw std::runtime_error("unable to allocate backend CPU buffer");
5456
+ }
5457
+ model.bufs.push_back(buf);
5458
+ bufs.emplace(idx, buf);
5459
+ #ifdef GGML_USE_CUDA
5460
+ if (n_layer >= n_gpu_layers) {
5461
+ ggml_backend_cuda_register_host_buffer(
5047
5462
  ggml_backend_buffer_get_base(buf),
5048
5463
  ggml_backend_buffer_get_size(buf));
5049
- }
5464
+ }
5050
5465
  #endif
5466
+ }
5051
5467
  }
5052
5468
  #ifdef GGML_USE_METAL
5053
- else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
5054
- const size_t max_size = ggml_get_max_tensor_size(ctx);
5055
- size_t first, last;
5056
- ml.get_mapping_range(&first, &last, ctx);
5057
- buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size);
5469
+ else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
5470
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
5471
+ const size_t max_size = ggml_get_max_tensor_size(ctx);
5472
+ void * addr = nullptr;
5473
+ size_t first, last;
5474
+ ml.get_mapping_range(&first, &last, &addr, idx, ctx);
5475
+ if (first >= last) {
5476
+ continue;
5477
+ }
5478
+ ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
5479
+ if (buf == nullptr) {
5480
+ throw std::runtime_error("unable to allocate backend metal buffer");
5481
+ }
5482
+ model.bufs.push_back(buf);
5483
+ bufs.emplace(idx, buf);
5484
+ }
5058
5485
  }
5059
5486
  #endif
5060
5487
  else {
5061
- buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
5062
- if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
5488
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
5489
+ if (buf == nullptr) {
5490
+ throw std::runtime_error("unable to allocate backend buffer");
5491
+ }
5492
+ model.bufs.push_back(buf);
5493
+ if (use_mlock && ggml_backend_buffer_is_host(buf)) {
5063
5494
  model.mlock_bufs.emplace_back(new llama_mlock);
5064
5495
  auto & mlock_buf = model.mlock_bufs.back();
5065
5496
  mlock_buf->init (ggml_backend_buffer_get_base(buf));
5066
5497
  mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
5067
5498
  }
5499
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
5500
+ bufs.emplace(idx, buf);
5501
+ }
5068
5502
  }
5069
- if (buf == nullptr) {
5503
+
5504
+ if (bufs.empty()) {
5070
5505
  throw std::runtime_error("failed to allocate buffer");
5071
5506
  }
5072
- // indicate that this buffer contains weights
5073
- // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
5074
- ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
5075
- model.bufs.push_back(buf);
5076
- ctx_bufs.emplace_back(ctx, buf);
5507
+
5508
+ for (auto & buf : bufs) {
5509
+ // indicate that this buffer contains weights
5510
+ // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
5511
+ ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
5512
+ }
5513
+
5514
+ ctx_bufs.emplace_back(ctx, bufs);
5077
5515
  }
5078
5516
 
5079
5517
  if (llama_supports_gpu_offload()) {
@@ -5105,13 +5543,17 @@ static bool llm_load_tensors(
5105
5543
  // load tensor data
5106
5544
  for (auto & it : ctx_bufs) {
5107
5545
  ggml_context * ctx = it.first;
5108
- ggml_backend_buffer_t buf = it.second;
5109
- if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) {
5546
+ auto & bufs = it.second;
5547
+ if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
5110
5548
  return false;
5111
5549
  }
5112
5550
  }
5113
5551
 
5114
- model.mapping = std::move(ml.mapping);
5552
+ if (use_mmap_buffer) {
5553
+ for (auto & mapping : ml.mappings) {
5554
+ model.mappings.emplace_back(std::move(mapping));
5555
+ }
5556
+ }
5115
5557
 
5116
5558
  // loading time will be recalculated after the first eval, so
5117
5559
  // we take page faults deferred by mmap() into consideration
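
The hunk above replaces the single model mapping with one mapping per split file: for each ggml context the loader now collects a map from file index to the backend buffer that wraps that file's tensor range, skipping files whose range is empty. The following standalone sketch mirrors only that bookkeeping, with plain std::vector data in place of the real mmap-backed files; the tensor_range struct and all sizes are made up for illustration.

#include <cstdint>
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

// Hypothetical stand-in: the byte range of one context's tensors inside one file.
struct tensor_range { size_t first, last; };

int main() {
    // Pretend the model is split across three files (contents do not matter here).
    std::vector<std::vector<uint8_t>> mapped_files = {
        std::vector<uint8_t>(1024), std::vector<uint8_t>(2048), std::vector<uint8_t>(512)
    };
    // Made-up ranges; the third file contributes no tensors to this context.
    std::vector<tensor_range> ranges = { {0, 1024}, {128, 2048}, {0, 0} };

    // file index -> (pointer, size) of the buffer wrapping that file's tensor range
    std::map<uint32_t, std::pair<uint8_t *, size_t>> bufs;

    for (uint32_t idx = 0; idx < mapped_files.size(); ++idx) {
        const tensor_range & r = ranges[idx];
        if (r.first >= r.last) {
            continue; // nothing from this file belongs to this context
        }
        bufs.emplace(idx, std::make_pair(mapped_files[idx].data() + r.first, r.last - r.first));
    }

    for (const auto & it : bufs) {
        std::printf("file %u -> buffer of %zu bytes\n", (unsigned) it.first, it.second.second);
    }
    return 0;
}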
@@ -5266,8 +5708,8 @@ static void llm_build_kv_store(
5266
5708
  GGML_ASSERT(kv.size == n_ctx);
5267
5709
 
5268
5710
  // compute the transposed [n_tokens, n_embd] V matrix
5269
- struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
5270
- //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
5711
+ assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
5712
+ struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
5271
5713
  cb(v_cur_t, "v_cur_t", il);
5272
5714
 
5273
5715
  struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
@@ -5451,6 +5893,20 @@ static struct ggml_tensor * llm_build_kqv(
5451
5893
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
5452
5894
  }
5453
5895
 
5896
+ if (model.arch == LLM_ARCH_GROK) {
5897
+ // need to do the following:
5898
+ // multiply by attn_output_multiplier of 0.08838834764831845
5899
+ // and then:
5900
+ // kq = 30 * tanh(kq / 30)
5901
+ // before the softmax below
5902
+
5903
+ //try from phi2
5904
+ //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
5905
+
5906
+ kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
5907
+ kq = ggml_scale(ctx, kq, 30);
5908
+ }
5909
+
5454
5910
  #if defined(GGML_USE_KOMPUTE)
5455
5911
  #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
5456
5912
  #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
@@ -5577,7 +6033,8 @@ struct llm_build_context {
5577
6033
  const float norm_rms_eps;
5578
6034
 
5579
6035
  const int32_t n_tokens;
5580
- const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx)
6036
+ const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
6037
+ const int32_t n_outputs;
5581
6038
  const int32_t kv_head; // index of where we store new KV data in the cache
5582
6039
  const int32_t n_orig_ctx;
5583
6040
 
@@ -5624,6 +6081,7 @@ struct llm_build_context {
5624
6081
  norm_rms_eps (hparams.f_norm_rms_eps),
5625
6082
  n_tokens (batch.n_tokens),
5626
6083
  n_kv (worst_case ? kv_self.size : kv_self.n),
6084
+ n_outputs (worst_case ? n_tokens : lctx.n_outputs),
5627
6085
  kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
5628
6086
  n_orig_ctx (cparams.n_yarn_orig_ctx),
5629
6087
  pooling_type (cparams.pooling_type),
@@ -5645,6 +6103,7 @@ struct llm_build_context {
5645
6103
  lctx.inp_tokens = nullptr;
5646
6104
  lctx.inp_embd = nullptr;
5647
6105
  lctx.inp_pos = nullptr;
6106
+ lctx.inp_out_ids = nullptr;
5648
6107
  lctx.inp_KQ_mask = nullptr;
5649
6108
  lctx.inp_KQ_pos = nullptr;
5650
6109
  lctx.inp_K_shift = nullptr;
@@ -5768,6 +6227,13 @@ struct llm_build_context {
5768
6227
  return lctx.inp_pos;
5769
6228
  }
5770
6229
 
6230
+ struct ggml_tensor * build_inp_out_ids() {
6231
+ lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
6232
+ cb(lctx.inp_out_ids, "inp_out_ids", -1);
6233
+ ggml_set_input(lctx.inp_out_ids);
6234
+ return lctx.inp_out_ids;
6235
+ }
6236
+
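
build_inp_out_ids() creates an I32 index tensor that the graph builders below pass to ggml_get_rows in the last layer, so only the rows whose outputs were requested continue through the remaining ops. A CPU-side sketch of the same row gather on a flat [n_tokens x n_embd] buffer, with made-up sizes:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Keep only the rows listed in out_ids from an n_tokens x n_embd matrix,
// mirroring ggml_get_rows(cur, inp_out_ids) in the last layer.
static std::vector<float> gather_rows(const std::vector<float> & x,
                                      int n_embd,
                                      const std::vector<int32_t> & out_ids) {
    std::vector<float> y(out_ids.size() * n_embd);
    for (size_t r = 0; r < out_ids.size(); ++r) {
        const float * src = x.data() + (size_t) out_ids[r] * n_embd;
        std::copy(src, src + n_embd, y.begin() + r * n_embd);
    }
    return y;
}

int main() {
    const int n_tokens = 4, n_embd = 3;
    std::vector<float> hidden(n_tokens * n_embd);
    for (int i = 0; i < n_tokens * n_embd; ++i) hidden[i] = (float) i;

    // e.g. only the last token needs logits
    std::vector<int32_t> out_ids = { n_tokens - 1 };
    std::vector<float> out = gather_rows(hidden, n_embd, out_ids);

    std::printf("kept %zu row(s), first value = %.1f\n", out_ids.size(), out[0]);
    return 0;
}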
5771
6237
  struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
5772
6238
  if (causal) {
5773
6239
  lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
@@ -5824,6 +6290,9 @@ struct llm_build_context {
5824
6290
  struct ggml_cgraph * build_llama() {
5825
6291
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5826
6292
 
6293
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
6294
+ int32_t n_tokens = this->n_tokens;
6295
+
5827
6296
  const int64_t n_embd_head = hparams.n_embd_head_v;
5828
6297
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5829
6298
  GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -5891,6 +6360,14 @@ struct llm_build_context {
5891
6360
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5892
6361
  }
5893
6362
 
6363
+ if (il == n_layer - 1) {
6364
+ // skip computing output for unused tokens
6365
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6366
+ n_tokens = n_outputs;
6367
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6368
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6369
+ }
6370
+
5894
6371
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
5895
6372
  cb(ffn_inp, "ffn_inp", il);
5896
6373
 
@@ -5943,19 +6420,19 @@ struct llm_build_context {
5943
6420
  for (int i = 0; i < n_expert_used; ++i) {
5944
6421
  ggml_tensor * cur_expert;
5945
6422
 
5946
- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
6423
+ ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
5947
6424
  cb(cur_up, "ffn_moe_up", il);
5948
6425
 
5949
- ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
6426
+ ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
5950
6427
  cb(cur_gate, "ffn_moe_gate", il);
5951
6428
 
5952
6429
  cur_gate = ggml_silu(ctx0, cur_gate);
5953
6430
  cb(cur_gate, "ffn_moe_silu", il);
5954
6431
 
5955
- cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
6432
+ cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
5956
6433
  cb(cur_expert, "ffn_moe_gate_par", il);
5957
6434
 
5958
- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
6435
+ cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
5959
6436
  cb(cur_expert, "ffn_moe_down", il);
5960
6437
 
5961
6438
  cur_expert = ggml_mul(ctx0, cur_expert,
@@ -6070,6 +6547,13 @@ struct llm_build_context {
6070
6547
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6071
6548
  }
6072
6549
 
6550
+ if (il == n_layer - 1) {
6551
+ // skip computing output for unused tokens
6552
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6553
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6554
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6555
+ }
6556
+
6073
6557
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6074
6558
  cb(ffn_inp, "ffn_inp", il);
6075
6559
 
@@ -6112,6 +6596,111 @@ struct llm_build_context {
6112
6596
  return gf;
6113
6597
  }
6114
6598
 
6599
+ struct ggml_cgraph * build_xverse() {
6600
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6601
+
6602
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6603
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6604
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6605
+
6606
+ struct ggml_tensor * cur;
6607
+ struct ggml_tensor * inpL;
6608
+
6609
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6610
+
6611
+ // inp_pos - contains the positions
6612
+ struct ggml_tensor * inp_pos = build_inp_pos();
6613
+
6614
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6615
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6616
+
6617
+ // positions of the tokens in the KV cache
6618
+ struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
6619
+
6620
+ for (int il = 0; il < n_layer; ++il) {
6621
+ struct ggml_tensor * inpSA = inpL;
6622
+
6623
+ cur = llm_build_norm(ctx0, inpL, hparams,
6624
+ model.layers[il].attn_norm, NULL,
6625
+ LLM_NORM_RMS, cb, il);
6626
+ cb(cur, "attn_norm", il);
6627
+
6628
+ // self-attention
6629
+ {
6630
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6631
+ cb(Qcur, "Qcur", il);
6632
+
6633
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
6634
+ cb(Kcur, "Kcur", il);
6635
+
6636
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
6637
+ cb(Vcur, "Vcur", il);
6638
+
6639
+ Qcur = ggml_rope_custom(
6640
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6641
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6642
+ ext_factor, attn_factor, beta_fast, beta_slow
6643
+ );
6644
+ cb(Qcur, "Qcur", il);
6645
+
6646
+ Kcur = ggml_rope_custom(
6647
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6648
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6649
+ ext_factor, attn_factor, beta_fast, beta_slow
6650
+ );
6651
+ cb(Kcur, "Kcur", il);
6652
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6653
+ model.layers[il].wo, NULL,
6654
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6655
+ }
6656
+
6657
+ if (il == n_layer - 1) {
6658
+ // skip computing output for unused tokens
6659
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6660
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6661
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6662
+ }
6663
+
6664
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6665
+ cb(ffn_inp, "ffn_inp", il);
6666
+
6667
+ // feed-forward network
6668
+ {
6669
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6670
+ model.layers[il].ffn_norm, NULL,
6671
+ LLM_NORM_RMS, cb, il);
6672
+ cb(cur, "ffn_norm", il);
6673
+
6674
+ cur = llm_build_ffn(ctx0, cur,
6675
+ model.layers[il].ffn_up, NULL,
6676
+ model.layers[il].ffn_gate, NULL,
6677
+ model.layers[il].ffn_down, NULL,
6678
+ NULL,
6679
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
6680
+ cb(cur, "ffn_out", il);
6681
+ }
6682
+
6683
+ cur = ggml_add(ctx0, cur, ffn_inp);
6684
+ cb(cur, "l_out", il);
6685
+
6686
+ // input for next layer
6687
+ inpL = cur;
6688
+ }
6689
+
6690
+ cur = inpL;
6691
+
6692
+ cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
6693
+ cb(cur, "result_norm", -1);
6694
+
6695
+ // lm_head
6696
+ cur = ggml_mul_mat(ctx0, model.output, cur);
6697
+ cb(cur, "result_output", -1);
6698
+
6699
+ ggml_build_forward_expand(gf, cur);
6700
+
6701
+ return gf;
6702
+ }
6703
+
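
build_xverse follows the same pre-norm residual layout as build_llama: RMSNorm, self-attention, residual add, then RMSNorm, a SiLU-gated FFN, and a second residual add. The sketch below reduces one such layer to scalars, with the learned norm weights omitted and the attention/FFN replaced by identity placeholders, only to make the data flow explicit:

#include <cmath>
#include <cstdio>
#include <vector>

// RMSNorm over one vector: x / sqrt(mean(x^2) + eps); the learned scale weight
// applied by llm_build_norm is omitted here for brevity.
static std::vector<float> rms_norm(const std::vector<float> & x, float eps = 1e-5f) {
    float ss = 0.0f;
    for (float v : x) ss += v * v;
    const float scale = 1.0f / std::sqrt(ss / x.size() + eps);
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i) y[i] = x[i] * scale;
    return y;
}

int main() {
    // One token's hidden state; attention and FFN are stand-in identity ops here.
    std::vector<float> x = { 0.5f, -1.0f, 2.0f, 0.25f };
    auto attn = [](const std::vector<float> & v) { return v; };
    auto ffn  = [](const std::vector<float> & v) { return v; };

    // x = x + Attn(RMSNorm(x));  x = x + FFN(RMSNorm(x))
    std::vector<float> h = attn(rms_norm(x));
    for (size_t i = 0; i < x.size(); ++i) x[i] += h[i];
    h = ffn(rms_norm(x));
    for (size_t i = 0; i < x.size(); ++i) x[i] += h[i];

    std::printf("x[0] after one layer = %.4f\n", x[0]);
    return 0;
}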
6115
6704
  struct ggml_cgraph * build_falcon() {
6116
6705
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6117
6706
 
@@ -6185,6 +6774,14 @@ struct llm_build_context {
6185
6774
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6186
6775
  }
6187
6776
 
6777
+ if (il == n_layer - 1) {
6778
+ // skip computing output for unused tokens
6779
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6780
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6781
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6782
+ attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
6783
+ }
6784
+
6188
6785
  struct ggml_tensor * ffn_inp = cur;
6189
6786
 
6190
6787
  // feed forward
@@ -6225,6 +6822,214 @@ struct llm_build_context {
6225
6822
  return gf;
6226
6823
  }
6227
6824
 
6825
+ struct ggml_cgraph * build_grok() {
6826
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6827
+
6828
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
6829
+ int32_t n_tokens = this->n_tokens;
6830
+
6831
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6832
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6833
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6834
+
6835
+ struct ggml_tensor * cur;
6836
+ struct ggml_tensor * inpL;
6837
+
6838
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6839
+
6840
+ // multiply by embedding_multiplier_scale of 78.38367176906169
6841
+ inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
6842
+
6843
+ // inp_pos - contains the positions
6844
+ struct ggml_tensor * inp_pos = build_inp_pos();
6845
+
6846
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6847
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6848
+
6849
+ for (int il = 0; il < n_layer; ++il) {
6850
+ struct ggml_tensor * inpSA = inpL;
6851
+
6852
+ // norm
6853
+ cur = llm_build_norm(ctx0, inpL, hparams,
6854
+ model.layers[il].attn_norm, NULL,
6855
+ LLM_NORM_RMS, cb, il);
6856
+ cb(cur, "attn_norm", il);
6857
+
6858
+
6859
+ // self-attention
6860
+ {
6861
+ // compute Q and K and RoPE them
6862
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6863
+ cb(Qcur, "Qcur", il);
6864
+ if (model.layers[il].bq) {
6865
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
6866
+ cb(Qcur, "Qcur", il);
6867
+ }
6868
+
6869
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
6870
+ cb(Kcur, "Kcur", il);
6871
+ if (model.layers[il].bk) {
6872
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
6873
+ cb(Kcur, "Kcur", il);
6874
+ }
6875
+
6876
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
6877
+ cb(Vcur, "Vcur", il);
6878
+ if (model.layers[il].bv) {
6879
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
6880
+ cb(Vcur, "Vcur", il);
6881
+ }
6882
+
6883
+ Qcur = ggml_rope_custom(
6884
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6885
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6886
+ ext_factor, attn_factor, beta_fast, beta_slow
6887
+ );
6888
+ cb(Qcur, "Qcur", il);
6889
+
6890
+ Kcur = ggml_rope_custom(
6891
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6892
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6893
+ ext_factor, attn_factor, beta_fast, beta_slow
6894
+ );
6895
+ cb(Kcur, "Kcur", il);
6896
+
6897
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6898
+ model.layers[il].wo, model.layers[il].bo,
6899
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
6900
+ }
6901
+
6902
+ if (il == n_layer - 1) {
6903
+ // skip computing output for unused tokens
6904
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6905
+ n_tokens = n_outputs;
6906
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6907
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6908
+ }
6909
+
6910
+ // Grok
6911
+ // if attn_out_norm is present then apply it before adding the input
6912
+ if (model.layers[il].attn_out_norm) {
6913
+ cur = llm_build_norm(ctx0, cur, hparams,
6914
+ model.layers[il].attn_out_norm, NULL,
6915
+ LLM_NORM_RMS, cb, il);
6916
+ cb(cur, "attn_out_norm", il);
6917
+ }
6918
+
6919
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6920
+ cb(ffn_inp, "ffn_inp", il);
6921
+
6922
+ // feed-forward network
6923
+ // MoE branch
6924
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6925
+ model.layers[il].ffn_norm, NULL,
6926
+ LLM_NORM_RMS, cb, il);
6927
+ cb(cur, "ffn_norm", il);
6928
+
6929
+ ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
6930
+ cb(logits, "ffn_moe_logits", il);
6931
+
6932
+ ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
6933
+ cb(probs, "ffn_moe_probs", il);
6934
+
6935
+ // select experts
6936
+ ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
6937
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
6938
+
6939
+ ggml_tensor * weights = ggml_get_rows(ctx0,
6940
+ ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
6941
+ cb(weights, "ffn_moe_weights", il);
6942
+
6943
+ weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
6944
+
6945
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
6946
+ cb(weights_sum, "ffn_moe_weights_sum", il);
6947
+
6948
+ weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
6949
+ cb(weights, "ffn_moe_weights_norm", il);
6950
+
6951
+ // compute expert outputs
6952
+ ggml_tensor * moe_out = nullptr;
6953
+
6954
+ for (int i = 0; i < n_expert_used; ++i) {
6955
+ ggml_tensor * cur_expert;
6956
+
6957
+ ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
6958
+ cb(cur_up, "ffn_moe_up", il);
6959
+
6960
+ ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
6961
+ cb(cur_gate, "ffn_moe_gate", il);
6962
+
6963
+ //GeLU
6964
+ cur_gate = ggml_gelu(ctx0, cur_gate);
6965
+ cb(cur_gate, "ffn_moe_gelu", il);
6966
+
6967
+ cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
6968
+ cb(cur_expert, "ffn_moe_gate_par", il);
6969
+
6970
+ cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
6971
+ cb(cur_expert, "ffn_moe_down", il);
6972
+
6973
+ cur_expert = ggml_mul(ctx0, cur_expert,
6974
+ ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
6975
+ cb(cur_expert, "ffn_moe_weighted", il);
6976
+
6977
+ if (i == 0) {
6978
+ moe_out = cur_expert;
6979
+ } else {
6980
+ moe_out = ggml_add(ctx0, moe_out, cur_expert);
6981
+ cb(moe_out, "ffn_moe_out", il);
6982
+ }
6983
+ }
6984
+
6985
+ cur = moe_out;
6986
+
6987
+ // Grok
6988
+ // if layer_out_norm is present then apply it before adding the input
6989
+ // Idea: maybe ffn_out_norm is a better name
6990
+ if (model.layers[il].layer_out_norm) {
6991
+ cur = llm_build_norm(ctx0, cur, hparams,
6992
+ model.layers[il].layer_out_norm, NULL,
6993
+ LLM_NORM_RMS, cb, il);
6994
+ cb(cur, "layer_out_norm", il);
6995
+ }
6996
+
6997
+
6998
+ cur = ggml_add(ctx0, cur, ffn_inp);
6999
+ cb(cur, "ffn_out", il);
7000
+
7001
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
7002
+ if (layer_dir != nullptr) {
7003
+ cur = ggml_add(ctx0, cur, layer_dir);
7004
+ }
7005
+ cb(cur, "l_out", il);
7006
+
7007
+ // input for next layer
7008
+ inpL = cur;
7009
+ }
7010
+
7011
+ cur = inpL;
7012
+
7013
+ cur = llm_build_norm(ctx0, cur, hparams,
7014
+ model.output_norm, NULL,
7015
+ LLM_NORM_RMS, cb, -1);
7016
+ cb(cur, "result_norm", -1);
7017
+
7018
+ // lm_head
7019
+ cur = ggml_mul_mat(ctx0, model.output, cur);
7020
+
7021
+ // Grok
7022
+ // multiply logits by output_multiplier_scale of 0.5773502691896257
7023
+
7024
+ cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
7025
+
7026
+ cb(cur, "result_output", -1);
7027
+
7028
+ ggml_build_forward_expand(gf, cur);
7029
+
7030
+ return gf;
7031
+ }
7032
+
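
The MoE branch in build_grok routes each token through n_expert_used experts: a softmax over the gate logits, top-k selection, renormalization of the selected weights so they sum to one, and a weighted sum of the per-expert FFN outputs (GELU-gated here, SiLU-gated in build_llama). A CPU sketch of that routing math for a single token, with made-up gate logits and stand-in expert outputs:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    const int n_expert = 4, n_expert_used = 2;

    // Gate logits for one token (made up), as produced by ffn_gate_inp.
    std::vector<float> logits = { 0.1f, 2.0f, -1.0f, 1.5f };

    // softmax -> per-expert probabilities
    float mx = *std::max_element(logits.begin(), logits.end());
    std::vector<float> probs(n_expert);
    float sum = 0.0f;
    for (int e = 0; e < n_expert; ++e) { probs[e] = std::exp(logits[e] - mx); sum += probs[e]; }
    for (float & p : probs) p /= sum;

    // top-k expert indices by probability (the graph above builds this from an argsort)
    std::vector<int> idx(n_expert);
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; });

    // renormalize the selected weights so they sum to 1
    float wsum = 0.0f;
    for (int i = 0; i < n_expert_used; ++i) wsum += probs[idx[i]];

    // weighted sum of expert outputs; expert e's FFN output is stood in by float(e)
    float moe_out = 0.0f;
    for (int i = 0; i < n_expert_used; ++i) {
        const int e = idx[i];
        const float w = probs[e] / wsum;
        moe_out += w * (float) e;
        std::printf("expert %d weight %.3f\n", e, w);
    }
    std::printf("moe_out = %.3f\n", moe_out);
    return 0;
}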
6228
7033
  struct ggml_cgraph * build_starcoder() {
6229
7034
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6230
7035
 
@@ -6279,6 +7084,13 @@ struct llm_build_context {
6279
7084
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6280
7085
  }
6281
7086
 
7087
+ if (il == n_layer - 1) {
7088
+ // skip computing output for unused tokens
7089
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7090
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7091
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7092
+ }
7093
+
6282
7094
  // add the input
6283
7095
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
6284
7096
  cb(ffn_inp, "ffn_inp", il);
@@ -6476,6 +7288,13 @@ struct llm_build_context {
6476
7288
  Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6477
7289
  }
6478
7290
 
7291
+ if (il == n_layer - 1) {
7292
+ // skip computing output for unused tokens
7293
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7294
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7295
+ residual = ggml_get_rows(ctx0, residual, inp_out_ids);
7296
+ }
7297
+
6479
7298
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
6480
7299
  cb(ffn_inp, "ffn_inp", il);
6481
7300
 
@@ -6565,6 +7384,13 @@ struct llm_build_context {
6565
7384
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6566
7385
  }
6567
7386
 
7387
+ if (il == n_layer - 1) {
7388
+ // skip computing output for unused tokens
7389
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7390
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7391
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7392
+ }
7393
+
6568
7394
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6569
7395
  cb(ffn_inp, "ffn_inp", il);
6570
7396
 
@@ -6722,6 +7548,13 @@ struct llm_build_context {
6722
7548
  }
6723
7549
  cb(cur, "kqv_out", il);
6724
7550
 
7551
+ if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
7552
+ // skip computing output for unused tokens
7553
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7554
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7555
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7556
+ }
7557
+
6725
7558
  // re-add the layer input
6726
7559
  cur = ggml_add(ctx0, cur, inpL);
6727
7560
 
@@ -6844,6 +7677,13 @@ struct llm_build_context {
6844
7677
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6845
7678
  }
6846
7679
 
7680
+ if (il == n_layer - 1) {
7681
+ // skip computing output for unused tokens
7682
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7683
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7684
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7685
+ }
7686
+
6847
7687
  // Add the input
6848
7688
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
6849
7689
  cb(ffn_inp, "ffn_inp", il);
@@ -6891,6 +7731,7 @@ struct llm_build_context {
6891
7731
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6892
7732
 
6893
7733
  struct ggml_tensor * cur;
7734
+ struct ggml_tensor * pos;
6894
7735
  struct ggml_tensor * inpL;
6895
7736
 
6896
7737
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -6901,6 +7742,16 @@ struct llm_build_context {
6901
7742
  // positions of the tokens in the KV cache
6902
7743
  struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
6903
7744
 
7745
+ if (model.pos_embd) {
7746
+ // inp_pos - contains the positions
7747
+ struct ggml_tensor * inp_pos = build_inp_pos();
7748
+ pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
7749
+ cb(pos, "pos_embd", -1);
7750
+
7751
+ inpL = ggml_add(ctx0, inpL, pos);
7752
+ cb(inpL, "inpL", -1);
7753
+ }
7754
+
6904
7755
  for (int il = 0; il < n_layer; ++il) {
6905
7756
  struct ggml_tensor * attn_norm;
6906
7757
 
@@ -6935,11 +7786,39 @@ struct llm_build_context {
6935
7786
  cb(Kcur, "Kcur", il);
6936
7787
  cb(Vcur, "Vcur", il);
6937
7788
 
6938
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7789
+ // Q/K Layernorm
7790
+ if (model.layers[il].attn_q_norm) {
7791
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
7792
+ model.layers[il].attn_q_norm,
7793
+ model.layers[il].attn_q_norm_b,
7794
+ LLM_NORM, cb, il);
7795
+ cb(Qcur, "Qcur", il);
6939
7796
 
6940
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7797
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
7798
+ model.layers[il].attn_k_norm,
7799
+ model.layers[il].attn_k_norm_b,
7800
+ LLM_NORM, cb, il);
7801
+ cb(Kcur, "Kcur", il);
7802
+
7803
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7804
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7805
+
7806
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6941
7807
  model.layers[il].wo, model.layers[il].bo,
6942
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7808
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7809
+ } else {
7810
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7811
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7812
+ model.layers[il].wo, model.layers[il].bo,
7813
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7814
+ }
7815
+ }
7816
+
7817
+ if (il == n_layer - 1) {
7818
+ // skip computing output for unused tokens
7819
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7820
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7821
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6943
7822
  }
6944
7823
 
6945
7824
  // Add the input
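
When the optional attn_q_norm/attn_k_norm tensors are present, the MPT path above now runs a full LayerNorm (weight and bias) over Qcur and Kcur before reshaping them for attention. A sketch of that normalization on one vector; the epsilon and the weight/bias values are placeholders, the real ones come from hparams and the loaded tensors:

#include <cmath>
#include <cstdio>
#include <vector>

// LayerNorm as in LLM_NORM: normalize to zero mean / unit variance,
// then apply the learned scale (weight) and shift (bias).
static void layer_norm(std::vector<float> & x,
                       const std::vector<float> & w,
                       const std::vector<float> & b,
                       float eps = 1e-5f) {
    float mean = 0.0f;
    for (float v : x) mean += v;
    mean /= x.size();

    float var = 0.0f;
    for (float v : x) var += (v - mean) * (v - mean);
    var /= x.size();

    const float inv = 1.0f / std::sqrt(var + eps);
    for (size_t i = 0; i < x.size(); ++i) {
        x[i] = (x[i] - mean) * inv * w[i] + b[i];
    }
}

int main() {
    std::vector<float> q = { 1.0f, 2.0f, 3.0f, 4.0f };   // a slice of Qcur (made up)
    std::vector<float> w(q.size(), 1.0f);                // attn_q_norm weight placeholder
    std::vector<float> b(q.size(), 0.0f);                // attn_q_norm bias placeholder
    layer_norm(q, w, b);
    std::printf("normalized q[0] = %.4f\n", q[0]);
    return 0;
}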
@@ -7055,6 +7934,13 @@ struct llm_build_context {
7055
7934
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7056
7935
  }
7057
7936
 
7937
+ if (il == n_layer - 1) {
7938
+ // skip computing output for unused tokens
7939
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7940
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7941
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7942
+ }
7943
+
7058
7944
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7059
7945
  cb(ffn_inp, "ffn_inp", il);
7060
7946
 
@@ -7161,6 +8047,13 @@ struct llm_build_context {
7161
8047
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7162
8048
  }
7163
8049
 
8050
+ if (il == n_layer - 1) {
8051
+ // skip computing output for unused tokens
8052
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8053
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8054
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8055
+ }
8056
+
7164
8057
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7165
8058
  cb(ffn_inp, "ffn_inp", il);
7166
8059
 
@@ -7273,6 +8166,13 @@ struct llm_build_context {
7273
8166
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7274
8167
  }
7275
8168
 
8169
+ if (il == n_layer - 1) {
8170
+ // skip computing output for unused tokens
8171
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8172
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8173
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8174
+ }
8175
+
7276
8176
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7277
8177
  cb(ffn_inp, "ffn_inp", il);
7278
8178
 
@@ -7391,6 +8291,14 @@ struct llm_build_context {
7391
8291
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7392
8292
  }
7393
8293
 
8294
+ if (il == n_layer - 1) {
8295
+ // skip computing output for unused tokens
8296
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8297
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8298
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8299
+ attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
8300
+ }
8301
+
7394
8302
  // FF
7395
8303
  {
7396
8304
  ffn_output = llm_build_ffn(ctx0, attn_norm_output,
@@ -7488,6 +8396,14 @@ struct llm_build_context {
7488
8396
 
7489
8397
  cur = attention_norm;
7490
8398
 
8399
+ if (il == n_layer - 1) {
8400
+ // skip computing output for unused tokens
8401
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8402
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8403
+ sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
8404
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8405
+ }
8406
+
7491
8407
  // feed-forward network
7492
8408
  {
7493
8409
  cur = llm_build_ffn(ctx0, cur,
@@ -7580,6 +8496,13 @@ struct llm_build_context {
7580
8496
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7581
8497
  }
7582
8498
 
8499
+ if (il == n_layer - 1) {
8500
+ // skip computing output for unused tokens
8501
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8502
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8503
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8504
+ }
8505
+
7583
8506
  // add the input
7584
8507
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
7585
8508
  cb(ffn_inp, "ffn_inp", il);
@@ -7680,6 +8603,13 @@ struct llm_build_context {
7680
8603
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7681
8604
  }
7682
8605
 
8606
+ if (il == n_layer - 1) {
8607
+ // skip computing output for unused tokens
8608
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8609
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8610
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8611
+ }
8612
+
7683
8613
  // add the input
7684
8614
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
7685
8615
  cb(ffn_inp, "ffn_inp", il);
@@ -7789,6 +8719,13 @@ struct llm_build_context {
7789
8719
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7790
8720
  }
7791
8721
 
8722
+ if (il == n_layer - 1) {
8723
+ // skip computing output for unused tokens
8724
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8725
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8726
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8727
+ }
8728
+
7792
8729
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7793
8730
  cb(ffn_inp, "ffn_inp", il);
7794
8731
 
@@ -7899,6 +8836,13 @@ struct llm_build_context {
7899
8836
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7900
8837
  }
7901
8838
 
8839
+ if (il == n_layer - 1) {
8840
+ // skip computing output for unused tokens
8841
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8842
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8843
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8844
+ }
8845
+
7902
8846
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7903
8847
  cb(ffn_inp, "ffn_inp", il);
7904
8848
 
@@ -8022,6 +8966,13 @@ struct llm_build_context {
8022
8966
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8023
8967
  }
8024
8968
 
8969
+ if (il == n_layer - 1) {
8970
+ // skip computing output for unused tokens
8971
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8972
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8973
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8974
+ }
8975
+
8025
8976
  // scale_res - scale the hidden states for residual connection
8026
8977
  const float scale_res = scale_depth/sqrtf(float(n_layer));
8027
8978
  cur = ggml_scale(ctx0, cur, scale_res);
@@ -8136,6 +9087,13 @@ struct llm_build_context {
8136
9087
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
8137
9088
  }
8138
9089
 
9090
+ if (il == n_layer - 1) {
9091
+ // skip computing output for unused tokens
9092
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9093
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9094
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9095
+ }
9096
+
8139
9097
  struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
8140
9098
  cb(sa_out, "sa_out", il);
8141
9099
 
@@ -8248,6 +9206,13 @@ struct llm_build_context {
8248
9206
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8249
9207
  }
8250
9208
 
9209
+ if (il == n_layer - 1) {
9210
+ // skip computing output for unused tokens
9211
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9212
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9213
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
9214
+ }
9215
+
8251
9216
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8252
9217
  cb(ffn_inp, "ffn_inp", il);
8253
9218
 
@@ -8395,6 +9360,15 @@ struct llm_build_context {
8395
9360
 
8396
9361
  struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0);
8397
9362
 
9363
+ if (il == n_layer - 1) {
9364
+ // skip computing output for unused tokens
9365
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9366
+ x = ggml_get_rows(ctx0, x, inp_out_ids);
9367
+ y = ggml_get_rows(ctx0, y, inp_out_ids);
9368
+ z = ggml_get_rows(ctx0, z, inp_out_ids);
9369
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9370
+ }
9371
+
8398
9372
  // {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens}
8399
9373
  y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
8400
9374
  y = ggml_mul(ctx0, y, ggml_silu(ctx0, z));
@@ -8497,6 +9471,14 @@ struct llm_build_context {
8497
9471
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8498
9472
  }
8499
9473
 
9474
+ if (il == n_layer - 1) {
9475
+ // skip computing output for unused tokens
9476
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9477
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9478
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9479
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
9480
+ }
9481
+
8500
9482
  struct ggml_tensor * attn_out = cur;
8501
9483
 
8502
9484
  // feed-forward network
@@ -8648,6 +9630,10 @@ static struct ggml_cgraph * llama_build_graph(
8648
9630
  {
8649
9631
  result = llm.build_falcon();
8650
9632
  } break;
9633
+ case LLM_ARCH_GROK:
9634
+ {
9635
+ result = llm.build_grok();
9636
+ } break;
8651
9637
  case LLM_ARCH_STARCODER:
8652
9638
  {
8653
9639
  result = llm.build_starcoder();
@@ -8725,6 +9711,10 @@ static struct ggml_cgraph * llama_build_graph(
8725
9711
  {
8726
9712
  result = llm.build_mamba();
8727
9713
  } break;
9714
+ case LLM_ARCH_XVERSE:
9715
+ {
9716
+ result = llm.build_xverse();
9717
+ } break;
8728
9718
  case LLM_ARCH_COMMAND_R:
8729
9719
  {
8730
9720
  result = llm.build_command_r();
@@ -8790,9 +9780,39 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
8790
9780
  ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
8791
9781
  }
8792
9782
 
9783
+ if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
9784
+ GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
9785
+ const int64_t n_tokens = batch.n_tokens;
9786
+
9787
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
9788
+ int32_t * data = (int32_t *) lctx.inp_out_ids->data;
9789
+
9790
+ if (lctx.n_outputs == n_tokens) {
9791
+ for (int i = 0; i < n_tokens; ++i) {
9792
+ data[i] = i;
9793
+ }
9794
+ } else if (batch.logits) {
9795
+ int32_t n_outputs = 0;
9796
+ for (int i = 0; i < n_tokens; ++i) {
9797
+ if (batch.logits[i]) {
9798
+ data[n_outputs++] = i;
9799
+ }
9800
+ }
9801
+ // the graph needs to have been passed the correct number of outputs
9802
+ GGML_ASSERT(lctx.n_outputs == n_outputs);
9803
+ } else if (lctx.n_outputs == 1) {
9804
+ // only keep last output
9805
+ data[0] = n_tokens - 1;
9806
+ } else {
9807
+ GGML_ASSERT(lctx.n_outputs == 0);
9808
+ }
9809
+ }
9810
+
8793
9811
  GGML_ASSERT(
9812
+ // (!a || b) is a logical implication (a -> b)
9813
+ // !hparams.causal_attn -> !cparams.causal_attn
8794
9814
  (hparams.causal_attn || !cparams.causal_attn) &&
8795
- "non-causal attention with generative models is not supported"
9815
+ "causal attention with embedding models is not supported"
8796
9816
  );
8797
9817
 
8798
9818
  if (lctx.inp_KQ_mask) {
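
The new inp_out_ids block distinguishes three cases: every token is an output, only the tokens flagged in batch.logits are outputs, or only the last token is. The following sketch builds the same index list with a std::vector standing in for the I32 tensor; build_out_ids is a made-up helper name:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// Build the list of batch positions whose outputs (logits/embeddings) are kept.
static std::vector<int32_t> build_out_ids(int32_t n_tokens,
                                          const std::vector<int8_t> & logits_flags, // may be empty
                                          int32_t n_outputs) {
    std::vector<int32_t> out_ids;
    if (n_outputs == n_tokens) {
        for (int32_t i = 0; i < n_tokens; ++i) out_ids.push_back(i);   // keep everything
    } else if (!logits_flags.empty()) {
        for (int32_t i = 0; i < n_tokens; ++i) {
            if (logits_flags[i]) out_ids.push_back(i);                 // keep flagged tokens
        }
        assert((int32_t) out_ids.size() == n_outputs);                 // graph was built for this count
    } else if (n_outputs == 1) {
        out_ids.push_back(n_tokens - 1);                               // keep only the last token
    } else {
        assert(n_outputs == 0);
    }
    return out_ids;
}

int main() {
    std::vector<int8_t> flags = { 0, 1, 0, 1 };
    for (int32_t id : build_out_ids(4, flags, 2)) std::printf("%d ", id);
    std::printf("\n");
    return 0;
}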
@@ -8971,6 +9991,74 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
8971
9991
  }
8972
9992
  }
8973
9993
 
9994
+ // Make sure enough space is available for outputs.
9995
+ // Returns max number of outputs for which space was reserved.
9996
+ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
9997
+ const auto & cparams = lctx.cparams;
9998
+ const auto & hparams = lctx.model.hparams;
9999
+
10000
+ const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
10001
+
10002
+ const auto n_batch = cparams.n_batch;
10003
+ const auto n_vocab = hparams.n_vocab;
10004
+ const auto n_embd = hparams.n_embd;
10005
+
10006
+ // TODO: use a per-batch flag for logits presence instead
10007
+ const bool has_logits = cparams.causal_attn;
10008
+ const bool has_embd = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
10009
+
10010
+ const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
10011
+ const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
10012
+
10013
+ if (lctx.output_ids.empty()) {
10014
+ // init, never resized afterwards
10015
+ lctx.output_ids.resize(n_batch);
10016
+ }
10017
+
10018
+ const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0;
10019
+ const size_t new_size = (logits_size + embd_size) * sizeof(float);
10020
+
10021
+ // alloc only when more than the current capacity is required
10022
+ // TODO: also consider shrinking the buffer
10023
+ if (!lctx.buf_output || prev_size < new_size) {
10024
+ if (lctx.buf_output) {
10025
+ #ifndef NDEBUG
10026
+ // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
10027
+ LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
10028
+ #endif
10029
+ ggml_backend_buffer_free(lctx.buf_output);
10030
+ lctx.buf_output = nullptr;
10031
+ lctx.logits = nullptr;
10032
+ lctx.embd = nullptr;
10033
+ }
10034
+
10035
+ lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
10036
+ if (lctx.buf_output == nullptr) {
10037
+ LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
10038
+ return 0;
10039
+ }
10040
+ }
10041
+
10042
+ float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output);
10043
+
10044
+ lctx.logits = has_logits ? output_base : nullptr;
10045
+ lctx.embd = has_embd ? output_base + logits_size : nullptr;
10046
+
10047
+ lctx.output_size = n_outputs_max;
10048
+ lctx.logits_size = logits_size;
10049
+ lctx.embd_size = embd_size;
10050
+
10051
+ // set all ids as invalid (negative)
10052
+ std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);
10053
+
10054
+ ggml_backend_buffer_clear(lctx.buf_output, 0);
10055
+
10056
+ lctx.n_outputs = 0;
10057
+
10058
+ return n_outputs_max;
10059
+ }
10060
+
10061
+
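
llama_output_reserve sizes one host buffer that holds the logits followed by the optional embeddings for at most n_outputs_max rows, and only reallocates when the required size grows. The size arithmetic in isolation, with hypothetical dimensions:

#include <algorithm>
#include <cstdio>

int main() {
    // Hypothetical values; in the function above these come from cparams/hparams.
    const size_t n_outputs  = 3;
    const size_t n_seq_max  = 8;
    const size_t n_vocab    = 32000;
    const size_t n_embd     = 4096;
    const bool   has_logits = true;   // cparams.causal_attn
    const bool   has_embd   = false;  // cparams.embeddings && ...

    const size_t n_outputs_max = std::max(n_outputs, n_seq_max);
    const size_t logits_size   = has_logits ? n_vocab * n_outputs_max : 0;
    const size_t embd_size     = has_embd   ? n_embd  * n_outputs_max : 0;
    const size_t new_size      = (logits_size + embd_size) * sizeof(float);

    // The buffer is reallocated only when new_size exceeds the current capacity;
    // logits start at the base pointer and embeddings (if any) follow after logits_size floats.
    std::printf("reserve %.2f MiB for up to %zu output rows\n",
                new_size / (1024.0 * 1024.0), n_outputs_max);
    return 0;
}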
8974
10062
  static void llama_graph_compute(
8975
10063
  llama_context & lctx,
8976
10064
  ggml_cgraph * gf,
@@ -9046,16 +10134,8 @@ static int llama_decode_internal(
9046
10134
  const int64_t n_embd = hparams.n_embd;
9047
10135
  const int64_t n_vocab = hparams.n_vocab;
9048
10136
 
9049
-
9050
- auto * logits_out = lctx.logits;
9051
-
9052
- #ifndef NDEBUG
9053
- auto & logits_valid = lctx.logits_valid;
9054
- logits_valid.clear();
9055
- logits_valid.resize(n_tokens_all);
9056
-
9057
- memset(logits_out, 0, lctx.logits_size*sizeof(float));
9058
- #endif
10137
+ uint32_t n_outputs = 0;
10138
+ uint32_t n_outputs_prev = 0;
9059
10139
 
9060
10140
  const auto n_ubatch = cparams.n_ubatch;
9061
10141
 
@@ -9064,6 +10144,38 @@ static int llama_decode_internal(
9064
10144
  std::vector<llama_seq_id *> seq_id_arr;
9065
10145
  std::vector<std::vector<llama_seq_id>> seq_id;
9066
10146
 
10147
+ // count outputs
10148
+ if (batch_all.logits) {
10149
+ for (uint32_t i = 0; i < n_tokens_all; ++i) {
10150
+ n_outputs += batch_all.logits[i] != 0;
10151
+ }
10152
+ } else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
10153
+ n_outputs = n_tokens_all;
10154
+ } else {
10155
+ // keep last output only
10156
+ n_outputs = 1;
10157
+ }
10158
+
10159
+ // reserve output buffer
10160
+ if (llama_output_reserve(lctx, n_outputs) < n_outputs) {
10161
+ LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs);
10162
+ return -2;
10163
+ };
10164
+
10165
+ // set output mappings
10166
+ if (batch_all.logits) {
10167
+ int32_t i_logits = 0;
10168
+ for (uint32_t i = 0; i < n_tokens_all; ++i) {
10169
+ if (batch_all.logits[i]) {
10170
+ lctx.output_ids[i] = i_logits++;
10171
+ }
10172
+ }
10173
+ } else {
10174
+ for (uint32_t i = 0; i < n_outputs; ++i) {
10175
+ lctx.output_ids[i] = i;
10176
+ }
10177
+ }
10178
+
9067
10179
  for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
9068
10180
  const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
9069
10181
  llama_batch u_batch = {
@@ -9079,6 +10191,27 @@ static int llama_decode_internal(
9079
10191
  /* .all_seq_id = */ batch_all.all_seq_id,
9080
10192
  };
9081
10193
 
10194
+ // count the outputs in this u_batch
10195
+ {
10196
+ int32_t n_outputs_new = 0;
10197
+
10198
+ if (u_batch.logits) {
10199
+ for (uint32_t i = 0; i < n_tokens; i++) {
10200
+ n_outputs_new += u_batch.logits[i] != 0;
10201
+ }
10202
+ } else if (n_outputs == n_tokens_all) {
10203
+ n_outputs_new = n_tokens;
10204
+ } else {
10205
+ // keep last output only
10206
+ if (cur_token + n_tokens >= n_tokens_all) {
10207
+ n_outputs_new = 1;
10208
+ }
10209
+ }
10210
+
10211
+ // needs to happen before the graph is built
10212
+ lctx.n_outputs = n_outputs_new;
10213
+ }
10214
+
9082
10215
  int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
9083
10216
  GGML_ASSERT(n_threads > 0);
9084
10217
 
@@ -9142,23 +10275,37 @@ static int llama_decode_internal(
9142
10275
  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
9143
10276
  struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
9144
10277
 
9145
- if (!hparams.causal_attn) {
10278
+ if (lctx.n_outputs == 0) {
10279
+ // no output
10280
+ res = nullptr;
10281
+ embd = nullptr;
10282
+ } else if (!hparams.causal_attn) {
9146
10283
  res = nullptr; // do not extract logits for embedding models such as BERT
9147
10284
 
9148
10285
  // token or sequence embeddings
9149
10286
  embd = gf->nodes[gf->n_nodes - 1];
9150
10287
 
9151
10288
  GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
9152
- } else {
9153
- if (strcmp(res->name, "result_output") == 0) {
9154
- // the token embeddings could be the second to last tensor, or the third to last tensor
9155
- if (strcmp(embd->name, "result_norm") != 0) {
9156
- embd = gf->nodes[gf->n_nodes - 3];
9157
- GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
9158
- }
9159
- } else {
9160
- GGML_ASSERT(false && "missing result_output tensor");
10289
+ } else if (cparams.embeddings) {
10290
+ // the embeddings could be in the second to last tensor, or any of the previous tensors
10291
+ int i_embd = gf->n_nodes - 2;
10292
+ for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
10293
+ i_embd = gf->n_nodes - i;
10294
+ if (i_embd < 0) { break; }
10295
+ embd = gf->nodes[i_embd];
10296
+ }
10297
+ GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
10298
+
10299
+ // TODO: use a per-batch flag to know when to skip logits while keeping embeddings
10300
+ if (!cparams.causal_attn) {
10301
+ res = nullptr; // do not extract logits when not needed
10302
+ // skip computing logits
10303
+ // TODO: is this safe?
10304
+ gf->n_nodes = i_embd + 1;
9161
10305
  }
10306
+ } else {
10307
+ embd = nullptr; // do not extract embeddings when not needed
10308
+ GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
9162
10309
  }
9163
10310
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
9164
10311
 
@@ -9201,50 +10348,23 @@ static int llama_decode_internal(
9201
10348
  //}
9202
10349
 
9203
10350
  // extract logits
9204
- // TODO: do not compute and extract logits if only embeddings are needed
9205
- // update the graphs to skip "result_output" if logits are not needed
9206
10351
  if (res) {
9207
10352
  ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
9208
10353
  GGML_ASSERT(backend_res != nullptr);
9209
- if (u_batch.logits) {
9210
- int32_t i_first = -1;
9211
- for (uint32_t i = 0; i < n_tokens; i++) {
9212
- if (u_batch.logits[i] && i_first == -1) {
9213
- i_first = (int32_t) i;
9214
- }
9215
- if (u_batch.logits[i] == 0 || i == n_tokens - 1) {
9216
- if (i_first != -1) {
9217
- int i_last = u_batch.logits[i] == 0 ? i : i + 1;
9218
- // extract logits for the range [i_first, i_last)
9219
- // group the requests to minimize the number of calls to the backend
9220
- ggml_backend_tensor_get_async(backend_res, res,
9221
- logits_out + n_vocab*(cur_token + i_first),
9222
- i_first*n_vocab*sizeof(float),
9223
- (i_last - i_first)*n_vocab*sizeof(float));
9224
- i_first = -1;
9225
- }
9226
- }
9227
- #ifndef NDEBUG
9228
- logits_valid[cur_token + i] = u_batch.logits[i] != 0;;
9229
- #endif
9230
- }
9231
- } else if (lctx.logits_all) {
9232
- ggml_backend_tensor_get_async(backend_res, res, logits_out + n_vocab*cur_token, 0, n_vocab*n_tokens*sizeof(float));
9233
- #ifndef NDEBUG
9234
- std::fill(logits_valid.begin() + cur_token, logits_valid.begin() + cur_token + n_tokens, true);
9235
- #endif
9236
- } else {
9237
- if (cur_token + n_tokens >= n_tokens_all) {
9238
- ggml_backend_tensor_get_async(backend_res, res, logits_out, n_vocab*(n_tokens - 1)*sizeof(float), n_vocab*sizeof(float));
9239
- #ifndef NDEBUG
9240
- logits_valid[0] = true;
9241
- #endif
9242
- }
10354
+ GGML_ASSERT(lctx.logits != nullptr);
10355
+
10356
+ float * logits_out = lctx.logits + n_outputs_prev*n_vocab;
10357
+ const int32_t n_outputs_new = lctx.n_outputs;
10358
+
10359
+ if (n_outputs_new) {
10360
+ GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
10361
+ GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size);
10362
+ ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float));
9243
10363
  }
9244
10364
  }
9245
10365
 
9246
10366
  // extract embeddings
9247
- if (cparams.embeddings && embd) {
10367
+ if (embd) {
9248
10368
  ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
9249
10369
  GGML_ASSERT(backend_embd != nullptr);
9250
10370
 
@@ -9252,16 +10372,14 @@ static int llama_decode_internal(
9252
10372
  case LLAMA_POOLING_TYPE_NONE:
9253
10373
  {
9254
10374
  // extract token embeddings
9255
- auto & embd_out = lctx.embd;
9256
-
9257
- if (u_batch.logits) {
9258
- //embd_out.resize(n_embd * n_tokens);
9259
- for (uint32_t i = 0; i < n_tokens; i++) {
9260
- if (u_batch.logits[i] == 0) {
9261
- continue;
9262
- }
9263
- ggml_backend_tensor_get_async(backend_embd, embd, embd_out + n_embd*(i + cur_token), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
9264
- }
10375
+ GGML_ASSERT(lctx.embd != nullptr);
10376
+ float * embd_out = lctx.embd + n_outputs_prev*n_embd;
10377
+ const int32_t n_outputs_new = lctx.n_outputs;
10378
+
10379
+ if (n_outputs_new) {
10380
+ GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
10381
+ GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size);
10382
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
9265
10383
  }
9266
10384
  } break;
9267
10385
  case LLAMA_POOLING_TYPE_CLS:
@@ -9288,6 +10406,7 @@ static int llama_decode_internal(
9288
10406
  } break;
9289
10407
  }
9290
10408
  }
10409
+ n_outputs_prev += lctx.n_outputs;
9291
10410
  }
9292
10411
 
9293
10412
  // wait for the computation to finish (automatically done when obtaining the model output)
@@ -10218,7 +11337,7 @@ struct llm_tokenizer_wpm {
10218
11337
  if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
10219
11338
  continue;
10220
11339
  }
10221
- code = to_lower(code);
11340
+ code = unicode_tolower(code);
10222
11341
  if (type == CODEPOINT_TYPE_WHITESPACE) {
10223
11342
  code = ' ';
10224
11343
  }
@@ -10238,7 +11357,7 @@ struct llm_tokenizer_wpm {
10238
11357
  std::vector<std::string> words;
10239
11358
  while (r < new_str.size()) {
10240
11359
  // if is whitespace
10241
- if (isspace(new_str[r])) {
11360
+ if (isspace(new_str[r], std::locale::classic())) {
10242
11361
  if (r > l) words.push_back(new_str.substr(l, (r - l)));
10243
11362
  l = r + 1;
10244
11363
  r = l;
@@ -10252,18 +11371,12 @@ struct llm_tokenizer_wpm {
10252
11371
  return words;
10253
11372
  }
10254
11373
 
10255
- uint32_t to_lower(uint32_t code) {
10256
- static const std::locale locale("en_US.UTF-8");
10257
- #if defined(_WIN32)
10258
- if (code > 0xFFFF) {
10259
- return code;
10260
- }
10261
- #endif
10262
- return std::tolower(wchar_t(code), locale);
10263
- }
10264
-
10265
11374
  bool is_ascii_punct(uint32_t code) {
10266
- return code < 256 && ispunct(code);
11375
+ if (code > 0xFF) {
11376
+ return false;
11377
+ }
11378
+ auto c = char(static_cast<unsigned char>(code));
11379
+ return ispunct(c, std::locale::classic());
10267
11380
  }
10268
11381
 
10269
11382
  bool is_chinese_char(uint32_t cpt) {
@@ -10508,28 +11621,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
10508
11621
  // grammar - internal
10509
11622
  //
10510
11623
 
10511
- struct llama_partial_utf8 {
10512
- uint32_t value; // bit value so far (unshifted)
10513
- int n_remain; // num bytes remaining; -1 indicates invalid sequence
10514
- };
10515
-
10516
- struct llama_grammar {
10517
- const std::vector<std::vector<llama_grammar_element>> rules;
10518
- std::vector<std::vector<const llama_grammar_element *>> stacks;
10519
-
10520
- // buffer for partially generated UTF-8 sequence from accepted tokens
10521
- llama_partial_utf8 partial_utf8;
10522
- };
10523
-
10524
- struct llama_grammar_candidate {
10525
- size_t index;
10526
- const uint32_t * code_points;
10527
- llama_partial_utf8 partial_utf8;
10528
- };
10529
11624
 
10530
11625
  // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
10531
11626
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
10532
- static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
11627
+ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
10533
11628
  const std::string & src,
10534
11629
  llama_partial_utf8 partial_start) {
10535
11630
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@@ -10731,7 +11826,7 @@ static void llama_grammar_advance_stack(
10731
11826
  // be positioned at a character range (see `llama_grammar_advance_stack`), and
10732
11827
  // produces the N possible stacks if the given char is accepted at those
10733
11828
  // positions
10734
- static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
11829
+ std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
10735
11830
  const std::vector<std::vector<llama_grammar_element>> & rules,
10736
11831
  const std::vector<std::vector<const llama_grammar_element *>> & stacks,
10737
11832
  const uint32_t chr) {
@@ -11957,7 +13052,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
11957
13052
  // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
11958
13053
  // for getting the current layer as I initially thought, and we need to resort to parsing the
11959
13054
  // tensor name.
11960
- n_layer /= n_expert;
11961
13055
  if (sscanf(name, "blk.%d.", &i_layer) != 1) {
11962
13056
  throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
11963
13057
  }
@@ -11971,30 +13065,39 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
11971
13065
  // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
11972
13066
  // with the quantization of the output tensor
11973
13067
  if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
11974
- int nx = tensor->ne[0];
11975
- if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
11976
- new_type = GGML_TYPE_Q8_0;
11977
- }
11978
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
11979
- ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
11980
- new_type = GGML_TYPE_Q5_K;
11981
- }
11982
- else if (new_type != GGML_TYPE_Q8_0) {
11983
- new_type = GGML_TYPE_Q6_K;
13068
+ if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
13069
+ new_type = qs.params->output_tensor_type;
13070
+ } else {
13071
+ int nx = tensor->ne[0];
13072
+ if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
13073
+ new_type = GGML_TYPE_Q8_0;
13074
+ }
13075
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
13076
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
13077
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
13078
+ new_type = GGML_TYPE_Q5_K;
13079
+ }
13080
+ else if (new_type != GGML_TYPE_Q8_0) {
13081
+ new_type = GGML_TYPE_Q6_K;
13082
+ }
11984
13083
  }
11985
13084
  } else if (name == "token_embd.weight") {
11986
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
11987
- ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
11988
- new_type = GGML_TYPE_Q2_K;
11989
- }
11990
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
11991
- new_type = GGML_TYPE_IQ3_S;
11992
- }
11993
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
11994
- new_type = GGML_TYPE_IQ3_S;
13085
+ if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
13086
+ new_type = qs.params->token_embedding_type;
13087
+ } else {
13088
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
13089
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
13090
+ new_type = GGML_TYPE_Q2_K;
13091
+ }
13092
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
13093
+ new_type = GGML_TYPE_IQ3_S;
13094
+ }
13095
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
13096
+ new_type = GGML_TYPE_IQ3_S;
13097
+ }
11995
13098
  }
11996
13099
  } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
11997
- ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
13100
+ ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
11998
13101
  if (name.find("attn_v.weight") != std::string::npos) {
11999
13102
  if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
12000
13103
  else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
@@ -12013,7 +13116,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
12013
13116
  if (qs.model.hparams.n_expert == 8) {
12014
13117
  new_type = GGML_TYPE_Q5_K;
12015
13118
  } else {
12016
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
13119
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
12017
13120
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
12018
13121
  }
12019
13122
  }
@@ -12027,13 +13130,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
12027
13130
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
12028
13131
  new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
12029
13132
  }
12030
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
12031
- new_type = GGML_TYPE_Q4_K;
12032
- }
12033
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
12034
- new_type = GGML_TYPE_Q4_K;
12035
- }
12036
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
13133
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
12037
13134
  new_type = GGML_TYPE_Q4_K;
12038
13135
  }
12039
13136
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
@@ -12186,7 +13283,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
12186
13283
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
12187
13284
  new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
12188
13285
  new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
12189
- new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
13286
+ new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
13287
+ new_type == GGML_TYPE_IQ1_M) {
12190
13288
  int nx = tensor->ne[0];
12191
13289
  int ny = tensor->ne[1];
12192
13290
  if (nx % QK_K != 0) {
@@ -12204,6 +13302,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
12204
13302
  case GGML_TYPE_IQ3_XXS:
12205
13303
  case GGML_TYPE_IQ3_S:
12206
13304
  case GGML_TYPE_IQ1_S:
13305
+ case GGML_TYPE_IQ1_M:
12207
13306
  case GGML_TYPE_Q2_K:
12208
13307
  case GGML_TYPE_Q3_K:
12209
13308
  case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
@@ -12285,6 +13384,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12285
13384
  case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
12286
13385
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
12287
13386
  case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
13387
+ case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
12288
13388
  case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
12289
13389
  case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
12290
13390
  case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
@@ -12307,8 +13407,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12307
13407
  constexpr bool use_mmap = false;
12308
13408
  #endif
12309
13409
 
12310
- llama_model_loader ml(fname_inp, use_mmap, NULL);
12311
- ml.init_mapping(false); // no prefetching?
13410
+ llama_model_kv_override * kv_overrides = nullptr;
13411
+ if (params->kv_overrides) {
13412
+ auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
13413
+ kv_overrides = v->data();
13414
+ }
13415
+ llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
13416
+ ml.init_mappings(false); // no prefetching
12312
13417
 
12313
13418
  llama_model model;
12314
13419
  llm_load_arch(ml, model);
@@ -12332,36 +13437,43 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12332
13437
  struct gguf_context * ctx_out = gguf_init_empty();
12333
13438
 
12334
13439
  // copy the KV pairs from the input file
12335
- gguf_set_kv (ctx_out, ml.ctx_gguf);
13440
+ gguf_set_kv (ctx_out, ml.meta);
12336
13441
  gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
12337
13442
  gguf_set_val_u32(ctx_out, "general.file_type", ftype);
12338
13443
 
13444
+ if (params->kv_overrides) {
13445
+ const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
13446
+ for (auto & o : overrides) {
13447
+ if (o.key[0] == 0) break;
13448
+ if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
13449
+ gguf_set_val_f32(ctx_out, o.key, o.float_value);
13450
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
13451
+ gguf_set_val_i32(ctx_out, o.key, o.int_value);
13452
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
13453
+ gguf_set_val_bool(ctx_out, o.key, o.bool_value);
13454
+ } else {
13455
+ LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
13456
+ }
13457
+ }
13458
+ }
13459
+
12339
13460
  for (int i = 0; i < ml.n_tensors; ++i) {
12340
- struct ggml_tensor * meta = ml.get_tensor_meta(i);
13461
+ const struct ggml_tensor * meta = ml.get_tensor_meta(i);
12341
13462
 
12342
13463
  const std::string name = ggml_get_name(meta);
12343
13464
 
12344
13465
  // TODO: avoid hardcoded tensor names - use the TN_* constants
12345
13466
  if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
12346
13467
  ++qs.n_attention_wv;
12347
- }
12348
- else if (name.find("ffn_down") != std::string::npos) {
12349
- ++qs.n_ffn_down;
12350
- }
12351
- else if (name.find("ffn_gate") != std::string::npos) {
12352
- ++qs.n_ffn_gate;
12353
- }
12354
- else if (name.find("ffn_up") != std::string::npos) {
12355
- ++qs.n_ffn_up;
12356
- }
12357
- else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
13468
+ } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
12358
13469
  qs.has_output = true;
12359
13470
  }
12360
13471
  }
12361
- if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
12362
- LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
12363
- __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
12364
- }
13472
+
13473
+ qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
13474
+
13475
+ // sanity checks
13476
+ GGML_ASSERT(qs.n_attention_wv == (int)model.hparams.n_layer && "n_attention_wv != n_layer is unexpected");
12365
13477
 
12366
13478
  size_t total_size_org = 0;
12367
13479
  size_t total_size_new = 0;
@@ -12377,7 +13489,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12377
13489
 
12378
13490
  // populate the original tensors so we get an initial meta data
12379
13491
  for (int i = 0; i < ml.n_tensors; ++i) {
12380
- struct ggml_tensor * meta = ml.get_tensor_meta(i);
13492
+ const struct ggml_tensor * meta = ml.get_tensor_meta(i);
12381
13493
  gguf_add_tensor(ctx_out, meta);
12382
13494
  }
12383
13495
 
@@ -12391,6 +13503,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12391
13503
  // placeholder for the meta data
12392
13504
  ::zeros(fout, meta_size);
12393
13505
 
13506
+ const auto tn = LLM_TN(model.arch);
13507
+
12394
13508
  for (int i = 0; i < ml.n_tensors; ++i) {
12395
13509
  struct ggml_tensor * tensor = ml.get_tensor_meta(i);
12396
13510
 
@@ -12413,8 +13527,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12413
13527
  // This used to be a regex, but <regex> has an extreme cost to compile times.
12414
13528
  bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
12415
13529
 
12416
- // quantize only 2D tensors
12417
- quantize &= (ggml_n_dims(tensor) == 2);
13530
+ // quantize only 2D and 3D tensors (experts)
13531
+ quantize &= (ggml_n_dims(tensor) >= 2);
12418
13532
  quantize &= params->quantize_output_tensor || name != "output.weight";
12419
13533
  quantize &= !params->only_copy;
12420
13534
 
@@ -12443,6 +13557,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12443
13557
  if (!params->pure && ggml_is_quantized(default_type)) {
12444
13558
  new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
12445
13559
  }
13560
+ else if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
13561
+ new_type = params->token_embedding_type;
13562
+ }
13563
+ else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
13564
+ new_type = params->output_tensor_type;
13565
+ }
12446
13566
 
12447
13567
  // If we've decided to quantize to the same type the tensor is already
12448
13568
  // in then there's nothing to do.
@@ -12463,11 +13583,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12463
13583
  if (it == imatrix_data->end()) {
12464
13584
  LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
12465
13585
  } else {
12466
- if (it->second.size() == (size_t)tensor->ne[0]) {
13586
+ if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
12467
13587
  imatrix = it->second.data();
12468
13588
  } else {
12469
13589
  LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
12470
- int(it->second.size()), int(tensor->ne[0]), tensor->name);
13590
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
13591
+
13592
+ // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
13593
+ // this is a significant error and it may be good idea to abort the process if this happens,
13594
+ // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
13595
+ // tok_embd should be ignored in this case, since it always causes this warning
13596
+ if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
13597
+ throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
13598
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
13599
+ }
12471
13600
  }
12472
13601
  }
12473
13602
  }
@@ -12475,6 +13604,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12475
13604
  new_type == GGML_TYPE_IQ2_XS ||
12476
13605
  new_type == GGML_TYPE_IQ2_S ||
12477
13606
  new_type == GGML_TYPE_IQ1_S ||
13607
+ (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
12478
13608
  (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
12479
13609
  LLAMA_LOG_ERROR("\n\n============================================================\n");
12480
13610
  LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
@@ -12503,15 +13633,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12503
13633
  new_data = work.data();
12504
13634
 
12505
13635
  const int n_per_row = tensor->ne[0];
12506
- const int nrows = nelements / n_per_row;
13636
+ const int nrows = tensor->ne[1];
12507
13637
 
12508
13638
  static const int min_chunk_size = 32 * 512;
12509
13639
  const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
12510
13640
 
12511
- const int nchunk = (nelements + chunk_size - 1)/chunk_size;
13641
+ const int nelements_matrix = tensor->ne[0] * tensor->ne[1];
13642
+ const int nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
12512
13643
  const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
12513
- new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix, workers, nthread_use);
12514
13644
 
13645
+ // quantize each expert separately since they have different importance matrices
13646
+ new_size = 0;
13647
+ for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
13648
+ const float * f32_data_03 = f32_data + i03 * nelements_matrix;
13649
+ void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
13650
+ const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
13651
+
13652
+ new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
13653
+ }
12515
13654
  LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
12516
13655
  }
12517
13656
  total_size_org += ggml_nbytes(tensor);
@@ -12582,7 +13721,7 @@ static int llama_apply_lora_from_file_internal(
12582
13721
  if (path_base_model) {
12583
13722
  LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
12584
13723
  ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
12585
- ml->init_mapping(/*prefetch*/ false); // no prefetching
13724
+ ml->init_mappings(/*prefetch*/ false); // no prefetching
12586
13725
  }
12587
13726
 
12588
13727
  struct tensor_meta {
@@ -12703,7 +13842,7 @@ static int llama_apply_lora_from_file_internal(
12703
13842
 
12704
13843
  ggml_tensor * base_t;
12705
13844
  if (ml) {
12706
- if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) {
13845
+ if (!ml->get_tensor_meta(base_name.c_str())) {
12707
13846
  LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
12708
13847
  return 1;
12709
13848
  }
@@ -12887,11 +14026,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
12887
14026
  struct llama_model_quantize_params result = {
12888
14027
  /*.nthread =*/ 0,
12889
14028
  /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
14029
+ /*.output_tensor_type =*/ GGML_TYPE_COUNT,
14030
+ /*.token_embedding_type =*/ GGML_TYPE_COUNT,
12890
14031
  /*.allow_requantize =*/ false,
12891
14032
  /*.quantize_output_tensor =*/ true,
12892
14033
  /*.only_copy =*/ false,
12893
14034
  /*.pure =*/ false,
12894
14035
  /*.imatrix =*/ nullptr,
14036
+ /*.kv_overrides =*/ nullptr,
12895
14037
  };
12896
14038
 
12897
14039
  return result;
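
The two new type fields and the kv_overrides pointer shown above are plain public parameters. A minimal sketch of driving them through llama_model_quantize; the file paths and the override key below are placeholders for illustration, not values taken from this diff:

    #include "llama.h"

    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main() {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M;
        // new fields: pin the quantization type of output.weight and token_embd.weight
        params.output_tensor_type   = GGML_TYPE_Q6_K;
        params.token_embedding_type = GGML_TYPE_Q4_K;

        // new field: kv_overrides points at a std::vector<llama_model_kv_override>;
        // the consumers above stop at the first entry with an empty key, so keep a zeroed terminator
        std::vector<llama_model_kv_override> overrides(2); // overrides[1] stays zero-initialized
        std::strncpy(overrides[0].key, "general.example.count", sizeof(overrides[0].key) - 1); // placeholder key
        overrides[0].tag       = LLAMA_KV_OVERRIDE_TYPE_INT;
        overrides[0].int_value = 42;
        params.kv_overrides    = &overrides;

        const uint32_t rc = llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params);
        std::printf("llama_model_quantize returned %u\n", rc);
        return rc == 0 ? 0 : 1;
    }

Leaving output_tensor_type or token_embedding_type at GGML_TYPE_COUNT keeps the automatic per-tensor selection shown earlier in this diff.
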
@@ -12900,7 +14042,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
12900
14042
  size_t llama_max_devices(void) {
12901
14043
  #if defined(GGML_USE_METAL)
12902
14044
  return 1;
12903
- #elif defined(GGML_USE_CUBLAS)
14045
+ #elif defined(GGML_USE_CUDA)
12904
14046
  return GGML_CUDA_MAX_DEVICES;
12905
14047
  #elif defined(GGML_USE_SYCL)
12906
14048
  return GGML_SYCL_MAX_DEVICES;
@@ -12920,8 +14062,8 @@ bool llama_supports_mlock(void) {
12920
14062
  }
12921
14063
 
12922
14064
  bool llama_supports_gpu_offload(void) {
12923
- #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
12924
- defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
14065
+ #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
14066
+ defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
12925
14067
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
12926
14068
  return true;
12927
14069
  #else
@@ -13028,7 +14170,7 @@ struct llama_context * llama_new_context_with_model(
13028
14170
  const auto & hparams = model->hparams;
13029
14171
  auto & cparams = ctx->cparams;
13030
14172
 
13031
- // TODO: maybe add n_seq_max here too
14173
+ cparams.n_seq_max = std::max(1u, params.n_seq_max);
13032
14174
  cparams.n_threads = params.n_threads;
13033
14175
  cparams.n_threads_batch = params.n_threads_batch;
13034
14176
  cparams.yarn_ext_factor = params.yarn_ext_factor;
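
n_seq_max is now carried from llama_context_params into the context parameters (clamped to at least 1). A short sketch of setting it when creating a context; the model path is a placeholder and the surrounding calls are the usual llama.h entry points:

    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
        if (model == nullptr) {
            llama_backend_free();
            return 1;
        }

        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx     = 4096;
        cparams.n_seq_max = 4; // forwarded into the context as shown above

        llama_context * ctx = llama_new_context_with_model(model, cparams);
        const bool ok = ctx != nullptr;
        if (ok) {
            llama_free(ctx);
        }
        llama_free_model(model);
        llama_backend_free();
        return ok ? 0 : 1;
    }
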
@@ -13126,7 +14268,7 @@ struct llama_context * llama_new_context_with_model(
13126
14268
  }
13127
14269
  ctx->backends.push_back(ctx->backend_metal);
13128
14270
  }
13129
- #elif defined(GGML_USE_CUBLAS)
14271
+ #elif defined(GGML_USE_CUDA)
13130
14272
  if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
13131
14273
  // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
13132
14274
  ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
@@ -13149,7 +14291,20 @@ struct llama_context * llama_new_context_with_model(
13149
14291
  }
13150
14292
  }
13151
14293
  #elif defined(GGML_USE_VULKAN)
13152
- if (model->n_gpu_layers > 0) {
14294
+ if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
14295
+ LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
14296
+ llama_free(ctx);
14297
+ return nullptr;
14298
+ }
14299
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
14300
+ ggml_backend_t backend = ggml_backend_vk_init(0);
14301
+ if (backend == nullptr) {
14302
+ LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
14303
+ llama_free(ctx);
14304
+ return nullptr;
14305
+ }
14306
+ ctx->backends.push_back(backend);
14307
+ } else {
13153
14308
  for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
13154
14309
  ggml_backend_t backend = ggml_backend_vk_init(device);
13155
14310
  if (backend == nullptr) {
@@ -13161,30 +14316,28 @@ struct llama_context * llama_new_context_with_model(
13161
14316
  }
13162
14317
  }
13163
14318
  #elif defined(GGML_USE_SYCL)
13164
- if (model->n_gpu_layers > 0) {
13165
- // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
13166
- if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
13167
- ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
14319
+ // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
14320
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
14321
+ ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
14322
+ if (backend == nullptr) {
14323
+ int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
14324
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
14325
+ llama_free(ctx);
14326
+ return nullptr;
14327
+ }
14328
+ ctx->backends.push_back(backend);
14329
+ } else {
14330
+ // LLAMA_SPLIT_LAYER requires a backend for each GPU
14331
+ for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
14332
+ ggml_backend_t backend = ggml_backend_sycl_init(i);
13168
14333
  if (backend == nullptr) {
13169
- int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
13170
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
14334
+ int id_list[GGML_SYCL_MAX_DEVICES];
14335
+ ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
14336
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
13171
14337
  llama_free(ctx);
13172
14338
  return nullptr;
13173
14339
  }
13174
14340
  ctx->backends.push_back(backend);
13175
- } else {
13176
- // LLAMA_SPLIT_LAYER requires a backend for each GPU
13177
- for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
13178
- ggml_backend_t backend = ggml_backend_sycl_init(i);
13179
- if (backend == nullptr) {
13180
- int id_list[GGML_SYCL_MAX_DEVICES];
13181
- ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
13182
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
13183
- llama_free(ctx);
13184
- return nullptr;
13185
- }
13186
- ctx->backends.push_back(backend);
13187
- }
13188
14341
  }
13189
14342
  }
13190
14343
  #elif defined(GGML_USE_KOMPUTE)
@@ -13232,25 +14385,12 @@ struct llama_context * llama_new_context_with_model(
13232
14385
 
13233
14386
  // graph outputs buffer
13234
14387
  {
13235
- // resized during inference, reserve maximum
13236
- ctx->logits_size = hparams.n_vocab*cparams.n_batch;
13237
- ctx->embd_size = params.embeddings ? hparams.n_embd*cparams.n_batch : 0;
13238
-
13239
- const size_t buf_output_size = (ctx->logits_size + ctx->embd_size)*sizeof(float);
13240
-
13241
- ctx->buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_output_size);
13242
- if (ctx->buf_output == nullptr) {
13243
- LLAMA_LOG_ERROR("%s: failed to allocate logits buffer\n", __func__);
14388
+ // resized during inference when a batch uses more outputs
14389
+ if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
14390
+ LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
13244
14391
  llama_free(ctx);
13245
14392
  return nullptr;
13246
14393
  }
13247
- ggml_backend_buffer_clear(ctx->buf_output, 0);
13248
-
13249
-
13250
- ctx->logits = (float *) ggml_backend_buffer_get_base(ctx->buf_output);
13251
- if (params.embeddings) {
13252
- ctx->embd = ctx->logits + ctx->logits_size;
13253
- }
13254
14394
 
13255
14395
  LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
13256
14396
  ggml_backend_buffer_name(ctx->buf_output),
@@ -13275,7 +14415,7 @@ struct llama_context * llama_new_context_with_model(
13275
14415
 
13276
14416
  // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
13277
14417
  bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
13278
- #ifndef GGML_USE_CUBLAS
14418
+ #ifndef GGML_USE_CUDA
13279
14419
  // pipeline parallelism requires support for async compute and events
13280
14420
  // currently this is only implemented in the CUDA backend
13281
14421
  pipeline_parallel = false;
@@ -13383,11 +14523,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
13383
14523
  case LLM_ARCH_ORION:
13384
14524
  case LLM_ARCH_INTERNLM2:
13385
14525
  case LLM_ARCH_MINICPM:
14526
+ case LLM_ARCH_XVERSE:
13386
14527
  case LLM_ARCH_COMMAND_R:
13387
14528
  return LLAMA_ROPE_TYPE_NORM;
13388
14529
 
13389
14530
  // the pairs of head values are offset by n_rot/2
13390
14531
  case LLM_ARCH_FALCON:
14532
+ case LLM_ARCH_GROK:
13391
14533
  case LLM_ARCH_PERSIMMON:
13392
14534
  case LLM_ARCH_BERT:
13393
14535
  case LLM_ARCH_NOMIC_BERT:
@@ -13766,27 +14908,33 @@ void llama_kv_cache_update(struct llama_context * ctx) {
13766
14908
 
13767
14909
  // Returns the *maximum* size of the state
13768
14910
  size_t llama_get_state_size(const struct llama_context * ctx) {
14911
+ const auto & cparams = ctx->cparams;
14912
+ const auto & hparams = ctx->model.hparams;
14913
+
13769
14914
  // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
13770
14915
  // for reference, std::mt19937(1337) serializes to 6701 bytes.
13771
14916
  const size_t s_rng_size = sizeof(size_t);
13772
14917
  const size_t s_rng = LLAMA_MAX_RNG_STATE;
14918
+ const size_t s_n_outputs = sizeof(size_t);
14919
+ // assume worst case for outputs although only currently set ones are serialized
14920
+ const size_t s_output_pos = ctx->cparams.n_batch * sizeof(int32_t);
13773
14921
  const size_t s_logits_size = sizeof(size_t);
13774
- // assume worst case for logits although only currently set ones are serialized
13775
- const size_t s_logits = ctx->logits_size * sizeof(float);
14922
+ const size_t s_logits = ctx->logits_size ? cparams.n_batch * hparams.n_vocab * sizeof(float) : 0;
13776
14923
  const size_t s_embedding_size = sizeof(size_t);
13777
- const size_t s_embedding = ctx->embd_size * sizeof(float);
14924
+ const size_t s_embedding = ctx->embd_size ? cparams.n_batch * hparams.n_embd * sizeof(float) : 0;
13778
14925
  const size_t s_kv_buf_size = sizeof(size_t);
13779
14926
  const size_t s_kv_head = sizeof(uint32_t);
13780
14927
  const size_t s_kv_size = sizeof(uint32_t);
13781
14928
  const size_t s_kv_used = sizeof(uint32_t);
13782
14929
  const size_t s_kv = ctx->kv_self.total_size();
13783
- // TODO: assume the max is more than 1 seq_id per KV cell
13784
- const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
14930
+ const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
13785
14931
  const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
13786
14932
 
13787
14933
  const size_t s_total = (
13788
14934
  + s_rng_size
13789
14935
  + s_rng
14936
+ + s_n_outputs
14937
+ + s_output_pos
13790
14938
  + s_logits_size
13791
14939
  + s_logits
13792
14940
  + s_embedding_size
@@ -13861,7 +15009,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13861
15009
  std::ostringstream rng_ss;
13862
15010
  rng_ss << ctx->rng;
13863
15011
 
13864
- const std::string & rng_str = rng_ss.str();
15012
+ const std::string & rng_str = rng_ss.str();
13865
15013
  const size_t rng_size = rng_str.size();
13866
15014
 
13867
15015
  GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
@@ -13870,25 +15018,61 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13870
15018
  data_ctx->write(rng_str.data(), rng_size);
13871
15019
  }
13872
15020
 
13873
- // copy logits
15021
+ // copy outputs
13874
15022
  {
13875
- const size_t logits_size = ctx->logits_size;
15023
+ // Can't use ctx->n_outputs because it's not for the
15024
+ // entire last batch when n_ubatch is smaller than n_batch
15025
+ size_t n_outputs = 0;
13876
15026
 
13877
- data_ctx->write(&logits_size, sizeof(logits_size));
15027
+ // copy output ids
15028
+ {
15029
+ std::vector<int32_t> output_pos;
13878
15030
 
13879
- if (logits_size) {
13880
- data_ctx->write(ctx->logits, logits_size * sizeof(float));
15031
+ const size_t n_batch = ctx->cparams.n_batch;
15032
+ const auto & output_ids = ctx->output_ids;
15033
+
15034
+ output_pos.resize(ctx->output_size);
15035
+
15036
+ // build a more compact representation of the output ids
15037
+ for (size_t i = 0; i < n_batch; ++i) {
15038
+ // map an output id to a position in the batch
15039
+ int32_t pos = output_ids[i];
15040
+ if (pos >= 0) {
15041
+ if ((size_t) pos >= n_outputs) {
15042
+ n_outputs = pos + 1;
15043
+ }
15044
+ GGML_ASSERT((size_t) pos < ctx->output_size);
15045
+ output_pos[pos] = i;
15046
+ }
15047
+ }
15048
+
15049
+ data_ctx->write(&n_outputs, sizeof(n_outputs));
15050
+
15051
+ if (n_outputs) {
15052
+ data_ctx->write(output_pos.data(), n_outputs * sizeof(int32_t));
15053
+ }
13881
15054
  }
13882
- }
13883
15055
 
13884
- // copy embeddings
13885
- {
13886
- const size_t embeddings_size = ctx->embd_size;
15056
+ // copy logits
15057
+ {
15058
+ const size_t logits_size = std::min(ctx->logits_size, n_outputs * ctx->model.hparams.n_vocab);
13887
15059
 
13888
- data_ctx->write(&embeddings_size, sizeof(embeddings_size));
15060
+ data_ctx->write(&logits_size, sizeof(logits_size));
13889
15061
 
13890
- if (embeddings_size) {
13891
- data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
15062
+ if (logits_size) {
15063
+ data_ctx->write(ctx->logits, logits_size * sizeof(float));
15064
+ }
15065
+ }
15066
+
15067
+ // copy embeddings
15068
+ {
15069
+ const size_t embeddings_size = std::min(ctx->embd_size, n_outputs * ctx->model.hparams.n_embd);
15070
+
15071
+ data_ctx->write(&embeddings_size, sizeof(embeddings_size));
15072
+
15073
+ if (embeddings_size) {
15074
+ data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
15075
+ }
13892
15076
  }
13893
15077
  }
13894
15078
 
@@ -13901,9 +15085,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13901
15085
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
13902
15086
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
13903
15087
 
13904
- const size_t kv_buf_size = kv_self.total_size();
15088
+ // NOTE: kv_size and kv_buf_size are mostly used for sanity checks
13905
15089
  const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
13906
15090
  const uint32_t kv_size = kv_self.size;
15091
+ const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
13907
15092
  const uint32_t kv_used = kv_self.used;
13908
15093
 
13909
15094
  data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
@@ -13912,6 +15097,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13912
15097
  data_ctx->write(&kv_used, sizeof(kv_used));
13913
15098
 
13914
15099
  if (kv_buf_size) {
15100
+ const size_t pre_kv_buf_size = data_ctx->get_size_written();
15101
+
13915
15102
  std::vector<uint8_t> tmp_buf;
13916
15103
  for (int il = 0; il < (int) n_layer; ++il) {
13917
15104
  const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
@@ -13941,6 +15128,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13941
15128
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
13942
15129
  }
13943
15130
  }
15131
+ GGML_ASSERT(kv_buf_size == data_ctx->get_size_written() - pre_kv_buf_size);
13944
15132
  }
13945
15133
 
13946
15134
  for (uint32_t i = 0; i < kv_head; ++i) {
@@ -13985,6 +15173,28 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
13985
15173
  GGML_ASSERT(!rng_ss.fail());
13986
15174
  }
13987
15175
 
15176
+ // set output ids
15177
+ {
15178
+ size_t n_outputs;
15179
+ std::vector<int32_t> output_pos;
15180
+
15181
+ memcpy(&n_outputs, inp, sizeof(n_outputs)); inp += sizeof(n_outputs);
15182
+
15183
+ GGML_ASSERT(n_outputs <= llama_output_reserve(*ctx, n_outputs));
15184
+
15185
+ if (n_outputs) {
15186
+ output_pos.resize(n_outputs);
15187
+ memcpy(output_pos.data(), inp, n_outputs * sizeof(int32_t));
15188
+ inp += n_outputs * sizeof(int32_t);
15189
+
15190
+ for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
15191
+ int32_t id = output_pos[i];
15192
+ GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
15193
+ ctx->output_ids[id] = i;
15194
+ }
15195
+ }
15196
+ }
15197
+
13988
15198
  // set logits
13989
15199
  {
13990
15200
  size_t logits_size;
@@ -14005,7 +15215,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
14005
15215
 
14006
15216
  memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
14007
15217
 
14008
- GGML_ASSERT(ctx->embd_size == embeddings_size);
15218
+ GGML_ASSERT(ctx->embd_size >= embeddings_size);
14009
15219
 
14010
15220
  if (embeddings_size) {
14011
15221
  memcpy(ctx->embd, inp, embeddings_size * sizeof(float));
@@ -14032,8 +15242,18 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
14032
15242
  memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
14033
15243
  memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
14034
15244
 
15245
+ if (kv_self.size != kv_size) {
15246
+ // the KV cache needs to be big enough to load all the KV cells from the saved state
15247
+ GGML_ASSERT(kv_self.size >= kv_head);
15248
+
15249
+ LLAMA_LOG_INFO("%s: state contains %d KV cells, was saved with kv_size=%d, but is loaded with kv_size=%d (fine, but different)\n",
15250
+ __func__, kv_head, kv_size, kv_self.size);
15251
+ }
15252
+
14035
15253
  if (kv_buf_size) {
14036
- GGML_ASSERT(kv_self.total_size() == kv_buf_size);
15254
+ const size_t pre_kv_buf_size = inp - src;
15255
+
15256
+ GGML_ASSERT(kv_self.total_size() >= kv_buf_size);
14037
15257
 
14038
15258
  for (int il = 0; il < (int) n_layer; ++il) {
14039
15259
  const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
@@ -14053,23 +15273,21 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
14053
15273
 
14054
15274
  // v is not contiguous, copy row by row
14055
15275
  const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
14056
- const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
15276
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_self.size);
14057
15277
 
14058
15278
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
14059
15279
  ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
14060
15280
  inp += v_row_size;
14061
15281
  }
14062
15282
  }
15283
+ GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
14063
15284
  }
14064
15285
 
14065
- GGML_ASSERT(kv_self.size == kv_size);
15286
+ llama_kv_cache_clear(ctx);
14066
15287
 
14067
15288
  ctx->kv_self.head = kv_head;
14068
- ctx->kv_self.size = kv_size;
14069
15289
  ctx->kv_self.used = kv_used;
14070
15290
 
14071
- ctx->kv_self.cells.resize(kv_size);
14072
-
14073
15291
  for (uint32_t i = 0; i < kv_head; ++i) {
14074
15292
  llama_pos pos;
14075
15293
  size_t seq_id_size;
@@ -14086,11 +15304,6 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
14086
15304
  ctx->kv_self.cells[i].seq_id.insert(seq_id);
14087
15305
  }
14088
15306
  }
14089
-
14090
- for (uint32_t i = kv_head; i < kv_size; ++i) {
14091
- ctx->kv_self.cells[i].pos = -1;
14092
- ctx->kv_self.cells[i].seq_id.clear();
14093
- }
14094
15307
  }
14095
15308
 
14096
15309
  const size_t nread = inp - src;
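
Together with the output-id serialization above, llama_set_state_data now accepts states whose kv_size differs from the current context, as long as all saved KV cells fit. A minimal save/restore sketch, assuming a valid llama_context; the helper names are ours, not part of the API:

    #include "llama.h"

    #include <cstdint>
    #include <vector>

    // serialize the full context state (rng, output ids, logits, embeddings, KV cache)
    static std::vector<uint8_t> save_state(llama_context * ctx) {
        std::vector<uint8_t> buf(llama_get_state_size(ctx)); // upper bound on the serialized size
        const size_t written = llama_copy_state_data(ctx, buf.data());
        buf.resize(written);
        return buf;
    }

    // restore into a context created from the same model; its kv_size may differ from the saved one
    static void restore_state(llama_context * ctx, const std::vector<uint8_t> & buf) {
        llama_set_state_data(ctx, buf.data());
    }
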
@@ -14296,11 +15509,33 @@ float * llama_get_logits(struct llama_context * ctx) {
14296
15509
  }
14297
15510
 
14298
15511
  float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
14299
- assert(ctx->logits_valid.at(i));
14300
-
14301
15512
  llama_synchronize(ctx);
14302
15513
 
14303
- return ctx->logits + i*ctx->model.hparams.n_vocab;
15514
+ try {
15515
+ if (ctx->logits == nullptr) {
15516
+ throw std::runtime_error("no logits");
15517
+ }
15518
+ if ((size_t) i >= ctx->output_ids.size()) {
15519
+ throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
15520
+ }
15521
+ const int32_t j = ctx->output_ids[i];
15522
+
15523
+ if (j < 0) {
15524
+ throw std::runtime_error(format("batch.logits[%d] != true", i));
15525
+ }
15526
+ if ((size_t) j >= ctx->output_size) {
15527
+ // This should not happen
15528
+ throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
15529
+ }
15530
+
15531
+ return ctx->logits + j*ctx->model.hparams.n_vocab;
15532
+ } catch (const std::exception & err) {
15533
+ LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
15534
+ #ifndef NDEBUG
15535
+ GGML_ASSERT(false);
15536
+ #endif
15537
+ return nullptr;
15538
+ }
14304
15539
  }
14305
15540
 
14306
15541
  float * llama_get_embeddings(struct llama_context * ctx) {
@@ -14312,7 +15547,31 @@ float * llama_get_embeddings(struct llama_context * ctx) {
14312
15547
  float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
14313
15548
  llama_synchronize(ctx);
14314
15549
 
14315
- return ctx->embd + i*ctx->model.hparams.n_embd;
15550
+ try {
15551
+ if (ctx->embd == nullptr) {
15552
+ throw std::runtime_error("no embeddings");
15553
+ }
15554
+ if ((size_t) i >= ctx->output_ids.size()) {
15555
+ throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
15556
+ }
15557
+ const int32_t j = ctx->output_ids[i];
15558
+
15559
+ if (j < 0) {
15560
+ throw std::runtime_error(format("batch.logits[%d] != true", i));
15561
+ }
15562
+ if ((size_t) j >= ctx->output_size) {
15563
+ // This should not happen
15564
+ throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
15565
+ }
15566
+
15567
+ return ctx->embd + j*ctx->model.hparams.n_embd;
15568
+ } catch (const std::exception & err) {
15569
+ LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
15570
+ #ifndef NDEBUG
15571
+ GGML_ASSERT(false);
15572
+ #endif
15573
+ return nullptr;
15574
+ }
14316
15575
  }
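
With the checks above, llama_get_logits_ith and llama_get_embeddings_ith now log the reason and return nullptr for an invalid index (asserting only in debug builds) instead of reading out of bounds, so callers can guard the result:

    #include "llama.h"

    #include <cstdio>

    // assumes ctx is a valid context and i is a batch position that was flagged for output
    static void print_first_logit(llama_context * ctx, int32_t i) {
        const float * logits = llama_get_logits_ith(ctx, i);
        if (logits == nullptr) {
            std::fprintf(stderr, "no logits for batch position %d\n", i);
            return;
        }
        std::printf("logit[0] at position %d = %f\n", i, logits[0]);
    }
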
14317
15576
 
14318
15577
  float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
@@ -14602,6 +15861,55 @@ static int32_t llama_chat_apply_template_internal(
14602
15861
  ss << message->content << "</s>";
14603
15862
  }
14604
15863
  }
15864
+ } else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) {
15865
+ // openchat/openchat-3.5-0106,
15866
+ for (auto message : chat) {
15867
+ std::string role(message->role);
15868
+ if (role == "system") {
15869
+ ss << message->content << "<|end_of_turn|>";
15870
+ } else {
15871
+ role[0] = toupper(role[0]);
15872
+ ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
15873
+ }
15874
+ }
15875
+ if (add_ass) {
15876
+ ss << "GPT4 Correct Assistant:";
15877
+ }
15878
+ } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) {
15879
+ // eachadea/vicuna-13b-1.1 (and Orca variant)
15880
+ for (auto message : chat) {
15881
+ std::string role(message->role);
15882
+ if (role == "system") {
15883
+ // Orca-Vicuna variant uses a system prefix
15884
+ if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) {
15885
+ ss << "SYSTEM: " << message->content << "\n";
15886
+ } else {
15887
+ ss << message->content << "\n\n";
15888
+ }
15889
+ } else if (role == "user") {
15890
+ ss << "USER: " << message->content << "\n";
15891
+ } else if (role == "assistant") {
15892
+ ss << "ASSISTANT: " << message->content << "</s>\n";
15893
+ }
15894
+ }
15895
+ if (add_ass) {
15896
+ ss << "ASSISTANT:";
15897
+ }
15898
+ } else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) {
15899
+ // deepseek-ai/deepseek-coder-33b-instruct
15900
+ for (auto message : chat) {
15901
+ std::string role(message->role);
15902
+ if (role == "system") {
15903
+ ss << message->content;
15904
+ } else if (role == "user") {
15905
+ ss << "### Instruction:\n" << message->content << "\n";
15906
+ } else if (role == "assistant") {
15907
+ ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
15908
+ }
15909
+ }
15910
+ if (add_ass) {
15911
+ ss << "### Response:\n";
15912
+ }
14605
15913
  } else {
14606
15914
  // template not supported
14607
15915
  return -1;
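
The three templates added above (openchat, vicuna/vicuna-orca, deepseek) are selected either by name or by sniffing the template text. A sketch of requesting one by name through llama_chat_apply_template; passing a null model relies on the explicit template name, and the messages are made up for illustration:

    #include "llama.h"

    #include <cstdio>
    #include <vector>

    int main() {
        const std::vector<llama_chat_message> chat = {
            { "system",    "You are a concise assistant." },
            { "user",      "Hello!"                       },
            { "assistant", "Hi, how can I help?"          },
            { "user",      "Summarize the changes."       },
        };

        std::vector<char> buf(2048);
        int32_t n = llama_chat_apply_template(nullptr, "vicuna", chat.data(), chat.size(),
                                              /*add_ass=*/true, buf.data(), (int32_t) buf.size());
        if (n < 0) {
            std::fprintf(stderr, "template not supported\n");
            return 1;
        }
        if ((size_t) n > buf.size()) { // the return value is the required length; grow and retry
            buf.resize(n);
            n = llama_chat_apply_template(nullptr, "vicuna", chat.data(), chat.size(),
                                          /*add_ass=*/true, buf.data(), (int32_t) buf.size());
        }
        std::printf("%.*s\n", n, buf.data());
        return 0;
    }
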
@@ -14651,6 +15959,30 @@ LLAMA_API int32_t llama_chat_apply_template(
14651
15959
  return res;
14652
15960
  }
14653
15961
 
15962
+ LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
15963
+ static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
15964
+ if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
15965
+ return strlen(split_path);
15966
+ }
15967
+ return 0;
15968
+ }
15969
+
15970
+ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
15971
+ std::string str_split_path(split_path);
15972
+ char postfix[32];
15973
+ snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
15974
+ std::string str_postfix(postfix);
15975
+
15976
+ // check if split_path ends with postfix
15977
+ int size_prefix = str_split_path.size() - str_postfix.size();
15978
+ if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
15979
+ snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
15980
+ return size_prefix;
15981
+ }
15982
+
15983
+ return 0;
15984
+ }
15985
+
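
llama_split_path and llama_split_prefix above are the new public helpers for sharded GGUF files; split_no is zero-based while the generated file names are one-based. A small composition sketch with an illustrative prefix and shard count:

    #include "llama.h"

    #include <cstdio>

    int main() {
        // build "model-00002-of-00004.gguf" for the second of four shards
        char split_path[512];
        const int len = llama_split_path(split_path, sizeof(split_path), "model", /*split_no=*/1, /*split_count=*/4);
        std::printf("path   (%d chars): %s\n", len, split_path);

        // recover the prefix "model" back from that shard name
        char prefix[512];
        const int plen = llama_split_prefix(prefix, sizeof(prefix), split_path, /*split_no=*/1, /*split_count=*/4);
        std::printf("prefix (%d chars): %s\n", plen, prefix);
        return 0;
    }
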
14654
15986
  struct llama_timings llama_get_timings(struct llama_context * ctx) {
14655
15987
  struct llama_timings result = {
14656
15988
  /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,