llama_cpp 0.14.3 → 0.14.4

@@ -7,7 +7,7 @@
  #include "ggml-alloc.h"
  #include "ggml-backend.h"
 
- #ifdef GGML_USE_CUBLAS
+ #ifdef GGML_USE_CUDA
  # include "ggml-cuda.h"
  #elif defined(GGML_USE_CLBLAST)
  # include "ggml-opencl.h"
@@ -52,12 +52,16 @@
  #define NOMINMAX
  #endif
  #include <windows.h>
+ #ifndef PATH_MAX
+ #define PATH_MAX MAX_PATH
+ #endif
  #include <io.h>
  #endif
 
  #include <algorithm>
  #include <array>
  #include <cassert>
+ #include <cctype>
  #include <cfloat>
  #include <cinttypes>
  #include <climits>
@@ -68,7 +72,6 @@
  #include <cstdio>
  #include <cstring>
  #include <ctime>
- #include <cwctype>
  #include <forward_list>
  #include <fstream>
  #include <functional>
@@ -192,6 +195,7 @@ enum llm_arch {
  LLM_ARCH_LLAMA,
  LLM_ARCH_FALCON,
  LLM_ARCH_BAICHUAN,
+ LLM_ARCH_GROK,
  LLM_ARCH_GPT2,
  LLM_ARCH_GPTJ,
  LLM_ARCH_GPTNEOX,
@@ -214,6 +218,7 @@ enum llm_arch {
  LLM_ARCH_GEMMA,
  LLM_ARCH_STARCODER2,
  LLM_ARCH_MAMBA,
+ LLM_ARCH_XVERSE,
  LLM_ARCH_COMMAND_R,
  LLM_ARCH_UNKNOWN,
  };
@@ -221,6 +226,7 @@ enum llm_arch {
  static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_LLAMA, "llama" },
  { LLM_ARCH_FALCON, "falcon" },
+ { LLM_ARCH_GROK, "grok" },
  { LLM_ARCH_GPT2, "gpt2" },
  { LLM_ARCH_GPTJ, "gptj" },
  { LLM_ARCH_GPTNEOX, "gptneox" },
@@ -244,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_GEMMA, "gemma" },
  { LLM_ARCH_STARCODER2, "starcoder2" },
  { LLM_ARCH_MAMBA, "mamba" },
+ { LLM_ARCH_XVERSE, "xverse" },
  { LLM_ARCH_COMMAND_R, "command-r" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };
@@ -290,6 +297,10 @@ enum llm_kv {
  LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
  LLM_KV_ROPE_SCALING_FINETUNED,
 
+ LLM_KV_SPLIT_NO,
+ LLM_KV_SPLIT_COUNT,
+ LLM_KV_SPLIT_TENSORS_COUNT,
+
  LLM_KV_SSM_INNER_SIZE,
  LLM_KV_SSM_CONV_KERNEL,
  LLM_KV_SSM_STATE_SIZE,
@@ -355,6 +366,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
  { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
 
+ { LLM_KV_SPLIT_NO, "split.no" },
+ { LLM_KV_SPLIT_COUNT, "split.count" },
+ { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
+
  { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
  { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
  { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
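The three `split.*` keys added above are the GGUF metadata that marks a sharded model. A minimal stand-alone sketch of how a client could probe them with the public gguf API follows; the assumption that the values are stored as 16-bit unsigned integers matches the `uint16_t` reads later in this diff, and the file name handling is a placeholder.

    #include <cstdio>
    #include "ggml.h" // gguf_* API

    // Report whether a GGUF file is one shard of a split model (hedged sketch).
    static void print_split_info(const char * fname) {
        struct gguf_init_params params = { /*.no_alloc = */ true, /*.ctx = */ NULL };
        struct gguf_context * gguf = gguf_init_from_file(fname, params);
        if (!gguf) {
            return;
        }
        const int kid_no    = gguf_find_key(gguf, "split.no");
        const int kid_count = gguf_find_key(gguf, "split.count");
        if (kid_no >= 0 && kid_count >= 0) {
            // split.no is 0-based, split.count is the total number of shards (assumed u16 values)
            printf("shard %d of %d\n", (int) gguf_get_val_u16(gguf, kid_no) + 1, (int) gguf_get_val_u16(gguf, kid_count));
        } else {
            printf("single-file model\n");
        }
        gguf_free(gguf);
    }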
@@ -411,9 +426,12 @@ enum llm_tensor {
  LLM_TENSOR_FFN_DOWN,
  LLM_TENSOR_FFN_UP,
  LLM_TENSOR_FFN_ACT,
- LLM_TENSOR_FFN_DOWN_EXP,
+ LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
  LLM_TENSOR_FFN_GATE_EXP,
  LLM_TENSOR_FFN_UP_EXP,
+ LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
  LLM_TENSOR_ATTN_Q_NORM,
  LLM_TENSOR_ATTN_K_NORM,
  LLM_TENSOR_LAYER_OUT_NORM,
@@ -448,6 +466,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
  { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
  { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
  },
  },
  {
@@ -483,6 +504,31 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_GROK,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+ },
+ },
  {
  LLM_ARCH_GPT2,
  {
@@ -548,6 +594,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
+ { LLM_TENSOR_POS_EMBD, "position_embd" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
  },
  },
  {
@@ -843,6 +892,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
  },
  },
+ {
+ LLM_ARCH_XVERSE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_COMMAND_R,
  {
@@ -1030,7 +1098,7 @@ struct llama_file {
  size_t size;
 
  llama_file(const char * fname, const char * mode) {
- fp = std::fopen(fname, mode);
+ fp = ggml_fopen(fname, mode);
  if (fp == NULL) {
  throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
  }
@@ -1099,6 +1167,7 @@ struct llama_file {
  }
  }
  };
+ using llama_files = std::vector<std::unique_ptr<llama_file>>;
 
  struct llama_mmap {
  void * addr;
@@ -1299,6 +1368,7 @@ struct llama_mmap {
  }
  #endif
  };
+ using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
 
  // Represents some region of memory being locked using mlock or VirtualLock;
  // will automatically unlock on destruction.
@@ -1448,6 +1518,7 @@ struct llama_mlock {
  static void raw_unlock(const void * addr, size_t len) {}
  #endif
  };
+ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
 
  static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
  std::vector<char> result(8, 0);
@@ -1467,7 +1538,7 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
  static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
  ggml_backend_buffer_type_t buft = nullptr;
 
- #if defined(GGML_USE_CUBLAS)
+ #if defined(GGML_USE_CUDA)
  // host buffers should only be used when data is expected to be copied to/from the GPU
  if (host_buffer) {
  buft = ggml_backend_cuda_host_buffer_type();
@@ -1497,7 +1568,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 
  #ifdef GGML_USE_METAL
  buft = ggml_backend_metal_buffer_type();
- #elif defined(GGML_USE_CUBLAS)
+ #elif defined(GGML_USE_CUDA)
  buft = ggml_backend_cuda_buffer_type(gpu);
  #elif defined(GGML_USE_VULKAN)
  buft = ggml_backend_vk_buffer_type(gpu);
@@ -1523,7 +1594,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
  static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
  ggml_backend_buffer_type_t buft = nullptr;
 
- #ifdef GGML_USE_CUBLAS
+ #ifdef GGML_USE_CUDA
  if (ggml_backend_cuda_get_device_count() > 1) {
  buft = ggml_backend_cuda_split_buffer_type(tensor_split);
  }
@@ -1544,7 +1615,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
  }
 
  static size_t llama_get_device_count() {
- #if defined(GGML_USE_CUBLAS)
+ #if defined(GGML_USE_CUDA)
  return ggml_backend_cuda_get_device_count();
  #elif defined(GGML_USE_SYCL)
  return ggml_backend_sycl_get_device_count();
@@ -1556,7 +1627,7 @@ static size_t llama_get_device_count() {
  }
 
  static size_t llama_get_device_memory(int device) {
- #if defined(GGML_USE_CUBLAS)
+ #if defined(GGML_USE_CUDA)
  size_t total;
  size_t free;
  ggml_backend_cuda_get_device_memory(device, &total, &free);
@@ -1621,6 +1692,7 @@ enum e_model {
  MODEL_40B,
  MODEL_65B,
  MODEL_70B,
+ MODEL_314B,
  MODEL_SMALL,
  MODEL_MEDIUM,
  MODEL_LARGE,
@@ -1738,6 +1810,7 @@ struct llama_cparams {
  uint32_t n_ctx; // context size used during inference
  uint32_t n_batch;
  uint32_t n_ubatch;
+ uint32_t n_seq_max;
  uint32_t n_threads; // number of threads to use for generation
  uint32_t n_threads_batch; // number of threads to use for batch processing
 
@@ -1803,9 +1876,9 @@ struct llama_layer {
 
  // ff MoE
  struct ggml_tensor * ffn_gate_inp;
- struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
- struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
- struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
+ struct ggml_tensor * ffn_gate_exps;
+ struct ggml_tensor * ffn_down_exps;
+ struct ggml_tensor * ffn_up_exps ;
 
  // ff bias
  struct ggml_tensor * ffn_down_b; // b2
@@ -2023,12 +2096,12 @@ struct llama_model {
  // the model memory buffers for the tensor data
  std::vector<ggml_backend_buffer_t> bufs;
 
- // model memory mapped file
- std::unique_ptr<llama_mmap> mapping;
+ // model memory mapped files
+ llama_mmaps mappings;
 
  // objects representing data potentially being locked in memory
- std::vector<std::unique_ptr<llama_mlock>> mlock_bufs;
- llama_mlock mlock_mmap;
+ llama_mlocks mlock_bufs;
+ llama_mlocks mlock_mmaps;
 
  // for quantize-stats only
  std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
@@ -2041,7 +2114,7 @@ struct llama_model {
  ggml_free(ctx);
  }
  for (ggml_backend_buffer_t buf : bufs) {
- #ifdef GGML_USE_CUBLAS
+ #ifdef GGML_USE_CUDA
  if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
  ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
  }
@@ -2060,10 +2133,6 @@ struct llama_context {
  ggml_backend_free(backend);
  }
 
- #ifdef GGML_USE_VULKAN
- ggml_vk_free_cpu_assist();
- #endif
-
  ggml_backend_buffer_free(buf_output);
  }
 
@@ -2100,20 +2169,20 @@ struct llama_context {
  // host buffer for the model output (logits and embeddings)
  ggml_backend_buffer_t buf_output = nullptr;
 
- // decode output (2-dimensional array: [n_tokens][n_vocab])
- size_t logits_size = 0;
- float * logits = nullptr;
+ // decode output (2-dimensional array: [n_outputs][n_vocab])
+ size_t logits_size = 0; // capacity (of floats) for logits
+ float * logits = nullptr;
+
+ std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
+ size_t output_size = 0; // capacity (of tokens positions) for the output buffers
+ int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch
 
- #ifndef NDEBUG
- // guard against access to unset logits
- std::vector<bool> logits_valid;
- #endif
  bool logits_all = false;
 
- // embeddings output (2-dimensional array: [n_tokens][n_embd])
+ // embeddings output (2-dimensional array: [n_outputs][n_embd])
  // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
- size_t embd_size = 0;
- float * embd = nullptr;
+ size_t embd_size = 0; // capacity (of floats) for embeddings
+ float * embd = nullptr;
 
  // sequence embeddings output (map of [n_embd] vectors)
  // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
@@ -2130,14 +2199,15 @@ struct llama_context {
  struct ggml_tensor * inp_tokens; // I32 [n_batch]
  struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
  struct ggml_tensor * inp_pos; // I32 [n_batch]
+ struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
  struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
- struct ggml_tensor * inp_KQ_pos; // F32 [kv_size]
+ struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
  struct ggml_tensor * inp_K_shift; // I32 [kv_size]
  struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
  struct ggml_tensor * inp_cls; // I32 [n_batch]
  struct ggml_tensor * inp_s_copy; // I32 [kv_size]
- struct ggml_tensor * inp_s_mask; // F32 [1, kv_size]
- struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch]
+ struct ggml_tensor * inp_s_mask; // F32 [1, n_kv]
+ struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
 
  // control vectors
  struct llama_control_vector cvec;
@@ -2792,6 +2862,8 @@ namespace GGUFMeta {
  };
  }
 
+ using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
+
  struct llama_model_loader {
  int n_kv = 0;
  int n_tensors = 0;
@@ -2802,54 +2874,133 @@ struct llama_model_loader {
 
  bool use_mmap = false;
 
- llama_file file;
+ llama_files files;
  llama_ftype ftype;
  llama_fver fver;
 
- std::unique_ptr<llama_mmap> mapping;
+ llama_mmaps mappings;
+
+ // Holds information on a model weight
+ struct llama_tensor_weight {
+ uint16_t idx; // source file index
+ size_t offs; // tensor data offset in the original file
+
+ ggml_tensor * tensor;
+
+ llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+ const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
+ offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+ }
+ };
+ std::vector<llama_tensor_weight> weights;
+
  std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
 
- struct gguf_context * ctx_gguf = NULL;
- struct ggml_context * ctx_meta = NULL;
+ struct gguf_context * meta = NULL;
+ std::vector<ggml_context *> contexts;
 
  std::string arch_name;
  LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
 
- llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
+ llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
  int trace = 0;
  if (getenv("LLAMA_TRACE")) {
  trace = atoi(getenv("LLAMA_TRACE"));
  }
 
- struct gguf_init_params params = {
- /*.no_alloc = */ true,
- /*.ctx = */ &ctx_meta,
- };
-
  if (param_overrides_p != nullptr) {
  for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
  kv_overrides.insert({std::string(p->key), *p});
  }
  }
 
- ctx_gguf = gguf_init_from_file(fname.c_str(), params);
- if (!ctx_gguf) {
+ struct ggml_context * ctx = NULL;
+ struct gguf_init_params params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ &ctx,
+ };
+
+ meta = gguf_init_from_file(fname.c_str(), params);
+ if (!meta) {
  throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
  }
 
  get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
  llm_kv = LLM_KV(llm_arch_from_string(arch_name));
 
- n_kv = gguf_get_n_kv(ctx_gguf);
- n_tensors = gguf_get_n_tensors(ctx_gguf);
+ // Save tensors data offset of the main file.
+ // For subsidiary files, `meta` tensor data offset must not be used,
+ // so we build a unified tensors index for weights.
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+ weights.emplace_back(0, cur->name, meta, cur);
+ }
+ files.emplace_back(new llama_file(fname.c_str(), "rb"));
+ contexts.emplace_back(ctx);
+
+ uint16_t n_split = 0;
+ get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
+
+ // Load additional GGML contexts
+ if (n_split > 1) {
+ uint16_t idx = 0;
+ get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
+ if (idx != 0) {
+ throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
+ }
+
+ char split_prefix[PATH_MAX] = {0};
+ if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) {
+ throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
+ }
+
+ if (trace > 0) {
+ LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
+ }
+
+ char split_path[PATH_MAX] = {0};
+ for (idx = 1; idx < n_split; idx++) {
+ llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
+
+ struct gguf_init_params split_params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ &ctx,
+ };
+ struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
+ if (!ctx_gguf) {
+ throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
+ }
+
+ // Save tensors data offset info of the shard.
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+ weights.emplace_back(idx, cur->name, ctx_gguf, cur);
+ }
+ files.emplace_back(new llama_file(split_path, "rb"));
+ contexts.emplace_back(ctx);
+
+ gguf_free(ctx_gguf);
+ }
+
+ get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
+
+ // sanity check
+ {
+ const int n_tensors_loaded = (int) weights.size();
+ if (n_tensors != n_tensors_loaded) {
+ throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
+ }
+ }
+
+ LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
+ }
+
+ n_kv = gguf_get_n_kv(meta);
+ n_tensors = weights.size();
 
- fver = (enum llama_fver ) gguf_get_version(ctx_gguf);
+ fver = (enum llama_fver) gguf_get_version(meta);
 
- for (int i = 0; i < n_tensors; i++) {
- const char * name = gguf_get_tensor_name(ctx_gguf, i);
- struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
- n_elements += ggml_nelements(t);
- n_bytes += ggml_nbytes(t);
+ for (auto & w : weights) {
+ n_elements += ggml_nelements(w.tensor);
+ n_bytes += ggml_nbytes(w.tensor);
  }
 
  LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
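The constructor above resolves the remaining shards with `llama_split_prefix` and `llama_split_path`. For orientation, the on-disk naming those helpers operate on is of the form `<prefix>-00001-of-00003.gguf` (1-based, zero-padded indices); the following is a hedged stand-alone sketch of that formatting, not the library's implementation.

    #include <cstddef>
    #include <cstdio>

    // Build e.g. "model-00002-of-00004.gguf" from ("model", idx = 1, n_split = 4),
    // mirroring the idx = 1..n_split-1 loop in the loader above.
    static int example_split_path(char * out, size_t maxlen, const char * prefix, int idx, int n_split) {
        return snprintf(out, maxlen, "%s-%05d-of-%05d.gguf", prefix, idx + 1, n_split);
    }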
@@ -2864,7 +3015,8 @@ struct llama_model_loader {
  enum ggml_type type_max = GGML_TYPE_F32;
 
  for (int i = 0; i < n_tensors; i++) {
- enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);
+ const ggml_tensor * tensor = weights.at(i).tensor;
+ enum ggml_type type = tensor->type;
 
  n_type[type]++;
 
@@ -2874,8 +3026,8 @@ struct llama_model_loader {
  }
 
  if (trace > 0) {
- struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
- LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
+ const uint16_t sid = weights.at(i).idx;
+ LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
  }
  }
 
@@ -2897,6 +3049,7 @@ struct llama_model_loader {
  case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
  case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
  case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
+ case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break;
  case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
  case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
  case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
@@ -2911,22 +3064,23 @@ struct llama_model_loader {
  ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
 
  {
- const int kid = gguf_find_key(ctx_gguf, "general.file_type");
+ const int kid = gguf_find_key(meta, "general.file_type");
  if (kid >= 0) {
- ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
+ ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
  }
  }
 
  LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+
  for (int i = 0; i < n_kv; i++) {
- const char * name = gguf_get_key(ctx_gguf, i);
- const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+ const char * name = gguf_get_key(meta, i);
+ const enum gguf_type type = gguf_get_kv_type(meta, i);
  const std::string type_name =
  type == GGUF_TYPE_ARRAY
- ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
+ ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
  : gguf_type_name(type);
 
- std::string value = gguf_kv_to_str(ctx_gguf, i);
+ std::string value = gguf_kv_to_str(meta, i);
  const size_t MAX_VALUE_LEN = 40;
  if (value.size() > MAX_VALUE_LEN) {
  value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
@@ -2955,18 +3109,18 @@ struct llama_model_loader {
  }
 
  ~llama_model_loader() {
- if (ctx_gguf) {
- gguf_free(ctx_gguf);
+ if (meta) {
+ gguf_free(meta);
  }
- if (ctx_meta) {
- ggml_free(ctx_meta);
+ for (auto * ctx : contexts) {
+ ggml_free(ctx);
  }
  }
 
  template<typename T>
  typename std::enable_if<std::is_integral<T>::value, bool>::type
  get_arr_n(const std::string & key, T & result, const bool required = true) {
- const int kid = gguf_find_key(ctx_gguf, key.c_str());
+ const int kid = gguf_find_key(meta, key.c_str());
 
  if (kid < 0) {
  if (required) {
@@ -2976,7 +3130,7 @@ struct llama_model_loader {
  }
 
  struct GGUFMeta::ArrayInfo arr_info =
- GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
 
 
  result = arr_info.length;
@@ -2996,7 +3150,7 @@ struct llama_model_loader {
  const struct llama_model_kv_override * override =
  it != kv_overrides.end() ? &it->second : nullptr;
 
- const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);
+ const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
 
  if (required && !found) {
  throw std::runtime_error(format("key not found in model: %s", key.c_str()));
@@ -3019,28 +3173,57 @@ struct llama_model_loader {
  }
 
  const char * get_tensor_name(int i) const {
- return gguf_get_tensor_name(ctx_gguf, i);
+ return weights.at(i).tensor->name;
+ }
+
+ const llama_tensor_weight * get_weight(const char * name) const {
+ for (const auto & weight : weights) {
+ if (strcmp(name, weight.tensor->name) == 0) {
+ return &weight;
+ }
+ }
+ return nullptr;
+ }
+
+ const llama_tensor_weight & require_weight(const char * name) const {
+ const llama_tensor_weight * weight = get_weight(name);
+ if (!weight) {
+ throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
+ }
+ return *weight;
  }
 
  struct ggml_tensor * get_tensor_meta(const char * name) const {
- return ggml_get_tensor(ctx_meta, name);
+ const auto * weight = get_weight(name);
+ if (!weight) {
+ return nullptr;
+ }
+ return weight->tensor;
+ }
+
+ struct ggml_tensor * require_tensor_meta(const char * name) const {
+ struct ggml_tensor * tensor = get_tensor_meta(name);
+ if (!tensor) {
+ throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
+ }
+ return tensor;
  }
 
  struct ggml_tensor * get_tensor_meta(int i) const {
  return get_tensor_meta(get_tensor_name(i));
  }
 
- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) {
- struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
- ggml_set_name(tensor, ggml_get_name(meta));
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+ struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
+ ggml_set_name(tensor, ggml_get_name(cur));
 
  n_created++;
 
  return tensor;
  }
 
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
- struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
+ const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
+ const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
 
  if (cur == NULL) {
  if (!required) {
@@ -3051,8 +3234,8 @@ struct llama_model_loader {
 
  {
  bool is_ok = true;
- for (size_t i = 0; i < ne.size(); ++i) {
- if (ne[i] != cur->ne[i]) {
+ for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+ if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
  is_ok = false;
  break;
  }
@@ -3066,127 +3249,196 @@ struct llama_model_loader {
  }
  }
 
- return create_tensor_for(ctx, cur);
+ return cur;
  }
 
- void done_getting_tensors() const {
- if (n_created != n_tensors) {
- throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+
+ if (cur == NULL) {
+ return NULL;
  }
+
+ return create_tensor_for(ctx, cur);
  }
 
- size_t file_offset(const char * name) const {
- const int idx = gguf_find_tensor(ctx_gguf, name);
+ struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
 
- if (idx < 0) {
- throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name));
+ if (cur == NULL) {
+ return NULL;
  }
 
- return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
- }
+ if (cur->type != base->type) {
+ throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type)));
+ }
 
- void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) {
- // prefetch the whole file - all the data is needed anyway
- if (use_mmap) {
- mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
+ std::array<int64_t, GGML_MAX_DIMS> dims;
+ for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+ dims[i] = i < ne.size() ? ne[i] : 1;
  }
 
- // compute the total size of all tensors for progress reporting
- for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
- struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
- size_data += ggml_nbytes(cur);
+ struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
+ dims[0], dims[1], dims[2], dims[3],
+ cur->nb[1], cur->nb[2], cur->nb[3],
+ offset);
+
+ ggml_set_name(tensor, name.c_str());
+
+ n_created++;
+
+ return tensor;
+ }
+
+ void done_getting_tensors() const {
+ if (n_created != n_tensors) {
+ throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
  }
+ }
 
- if (use_mmap && mapping) {
- if (lmlock) {
- lmlock->init(mapping->addr);
+ void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) {
+ if (use_mmap) {
+ mappings.reserve(files.size());
+ mmaps_used.reserve(files.size());
+ for (const auto & file : files) {
+ std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
+ mmaps_used.emplace_back(mapping->size, 0);
+ if (mlock_mmaps) {
+ std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
+ mlock_mmap->init(mapping->addr);
+ mlock_mmaps->emplace_back(std::move(mlock_mmap));
+ }
+ mappings.emplace_back(std::move(mapping));
  }
- mmap_used_first = mapping->size;
+ }
+
+ // compute the total size of all tensors for progress reporting
+ for (auto & w : weights) {
+ size_data += ggml_nbytes(w.tensor);
  }
  }
 
- void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
- GGML_ASSERT(mapping);
+ void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
+ GGML_ASSERT(!mappings.empty());
+ const auto & mapping = mappings.at(idx);
 
  *first = mapping->size;
  *last = 0;
+ *addr = mapping->addr;
  for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
- const size_t offs = file_offset(ggml_get_name(tensor));
- *first = std::min(*first, offs);
- *last = std::max(*last, offs + ggml_nbytes(tensor));
+ try {
+ const auto * weight = get_weight(ggml_get_name(tensor));
+ if (!weight) {
+ continue;
+ }
+ if (weight->idx != idx) {
+ continue;
+ }
+ *first = std::min(*first, weight->offs);
+ *last = std::max(*last, weight->offs + ggml_nbytes(tensor));
+ } catch(...) {
+ // the tensor is not in the model
+ }
  }
  }
 
  // for backwards compatibility, does not support ggml-backend
  void load_data_for(struct ggml_tensor * cur) const {
- const size_t offs = file_offset(ggml_get_name(cur));
+ const auto & w = require_weight(ggml_get_name(cur));
 
- if (use_mmap && mapping) {
+ if (use_mmap) {
+ const auto & mapping = mappings.at(w.idx);
  if (cur->data == nullptr) {
- cur->data = (uint8_t *)mapping->addr + offs;
+ cur->data = (uint8_t *)mapping->addr + w.offs;
  } else {
- memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
+ memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur));
  }
  } else {
  GGML_ASSERT(cur->data != nullptr);
- file.seek(offs, SEEK_SET);
- file.read_raw(cur->data, ggml_nbytes(cur));
+ GGML_ASSERT(w.idx < files.size());
+ const auto & file = files.at(w.idx);
+ file->seek(w.offs, SEEK_SET);
+ file->read_raw(cur->data, ggml_nbytes(cur));
  }
  }
 
  size_t size_done = 0;
  size_t size_data = 0;
- size_t mmap_used_first = -1;
- size_t mmap_used_last = 0;
+ std::vector<std::pair<size_t, size_t>> mmaps_used;
 
  // Returns false if cancelled by progress_callback
- bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) {
- GGML_ASSERT(size_data != 0 && "call init_mapping() first");
+ bool load_all_data(
+ struct ggml_context * ctx,
+ llama_buf_map & bufs_mmap,
+ llama_mlocks * lmlocks,
+ llama_progress_callback progress_callback,
+ void * progress_callback_user_data) {
+ GGML_ASSERT(size_data != 0 && "call init_mappings() first");
 
  std::vector<no_init<uint8_t>> read_buf;
-
  for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+ const auto * weight = get_weight(ggml_get_name(cur));
+ if (weight == nullptr) {
+ // this can happen with split experts models
+ continue;
+ }
+
  if (progress_callback) {
  if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
  return false;
  }
  }
 
- const size_t offs = file_offset(ggml_get_name(cur));
+ size_t n_size = ggml_nbytes(cur);
 
- if (use_mmap && mapping) {
+ if (use_mmap) {
+ const auto & mapping = mappings.at(weight->idx);
+ ggml_backend_buffer_t buf_mmap = nullptr;
+ if (bufs_mmap.count(weight->idx)) {
+ buf_mmap = bufs_mmap.at(weight->idx);
+ }
+ GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
  if (buf_mmap && cur->data == nullptr) {
- ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
- if (lmlock) {
- lmlock->grow_to(offs + ggml_nbytes(cur));
+ ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
+ if (lmlocks) {
+ const auto & lmlock = lmlocks->at(weight->idx);
+ lmlock->grow_to(weight->offs + ggml_nbytes(cur));
  }
- mmap_used_first = std::min(mmap_used_first, offs);
- mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur));
+
+ auto & mmap_used = mmaps_used[weight->idx];
+ mmap_used.first = std::min(mmap_used.first, weight->offs);
+ mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
  } else {
- ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
+ ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
  }
  } else {
+ GGML_ASSERT(weight->idx < files.size());
+ const auto & file = files.at(weight->idx);
  if (ggml_backend_buffer_is_host(cur->buffer)) {
- file.seek(offs, SEEK_SET);
- file.read_raw(cur->data, ggml_nbytes(cur));
+ file->seek(weight->offs, SEEK_SET);
+ file->read_raw(cur->data, ggml_nbytes(cur));
  } else {
  read_buf.resize(ggml_nbytes(cur));
- file.seek(offs, SEEK_SET);
- file.read_raw(read_buf.data(), ggml_nbytes(cur));
- ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
+ file->seek(weight->offs, SEEK_SET);
+ file->read_raw(read_buf.data(), ggml_nbytes(cur));
+ ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
  }
  }
 
- size_done += ggml_nbytes(cur);
+ size_done += n_size;
  }
 
  // check if this is the last call and do final cleanup
  if (size_done >= size_data) {
  // unmap offloaded tensors and metadata
- if (use_mmap && mapping) {
- mapping->unmap_fragment(0, mmap_used_first);
- if (mmap_used_last != 0) {
- mapping->unmap_fragment(mmap_used_last, mapping->size);
+ if (use_mmap) {
+ for (uint32_t idx = 0; idx < mappings.size(); idx++) {
+ const auto & mmap_used = mmaps_used.at(idx);
+ auto & mapping = mappings.at(idx);
+ mapping->unmap_fragment(0, mmap_used.first);
+ if (mmap_used.second != 0) {
+ mapping->unmap_fragment(mmap_used.second, mapping->size);
+ }
  }
  }
  if (progress_callback) {
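The new `create_tensor_as_view` above is what lets legacy split-expert tensors be loaded into one merged 3-D tensor: expert `x` occupies the slice starting at byte offset `nb[2]*x`, as the per-arch loading code later in this diff shows. Below is a toy, self-contained sketch of that layout using the ggml API; the tensor sizes are invented for illustration and this is not llama.cpp code.

    #include "ggml.h"

    int main() {
        const int64_t n_embd = 8, n_ff = 16, n_expert = 4; // made-up sizes
        struct ggml_init_params ip = { /*.mem_size = */ 16*1024*1024, /*.mem_buffer = */ NULL, /*.no_alloc = */ false };
        struct ggml_context * ctx = ggml_init(ip);

        // merged tensor holding all experts: [n_embd, n_ff, n_expert]
        struct ggml_tensor * merged = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_ff, n_expert);

        for (int64_t x = 0; x < n_expert; ++x) {
            // 2-D view of expert x; strides come from the merged tensor, offset is nb[2]*x
            struct ggml_tensor * exp_x = ggml_view_4d(ctx, merged,
                n_embd, n_ff, 1, 1,
                merged->nb[1], merged->nb[2], merged->nb[3],
                merged->nb[2]*x);
            (void) exp_x;
        }

        ggml_free(ctx);
        return 0;
    }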
@@ -3259,6 +3511,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
3259
3511
  case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
3260
3512
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
3261
3513
  case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
3514
+ case LLAMA_FTYPE_MOSTLY_IQ1_M :return "IQ1_M - 1.75 bpw";
3262
3515
  case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
3263
3516
  case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
3264
3517
  case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
@@ -3290,6 +3543,7 @@ static const char * llama_model_type_name(e_model type) {
3290
3543
  case MODEL_40B: return "40B";
3291
3544
  case MODEL_65B: return "65B";
3292
3545
  case MODEL_70B: return "70B";
3546
+ case MODEL_314B: return "314B";
3293
3547
  case MODEL_SMALL: return "0.1B";
3294
3548
  case MODEL_MEDIUM: return "0.4B";
3295
3549
  case MODEL_LARGE: return "0.8B";
@@ -3319,7 +3573,7 @@ static void llm_load_hparams(
3319
3573
  llama_model_loader & ml,
3320
3574
  llama_model & model) {
3321
3575
  auto & hparams = model.hparams;
3322
- const gguf_context * ctx = ml.ctx_gguf;
3576
+ const gguf_context * ctx = ml.meta;
3323
3577
 
3324
3578
  // get metadata as string
3325
3579
  for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -3428,6 +3682,15 @@ static void llm_load_hparams(
3428
3682
  default: model.type = e_model::MODEL_UNKNOWN;
3429
3683
  }
3430
3684
  } break;
3685
+ case LLM_ARCH_GROK:
3686
+ {
3687
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
3688
+
3689
+ switch (hparams.n_layer) {
3690
+ case 64: model.type = e_model::MODEL_314B; break;
3691
+ default: model.type = e_model::MODEL_UNKNOWN;
3692
+ }
3693
+ } break;
3431
3694
  case LLM_ARCH_FALCON:
3432
3695
  {
3433
3696
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3679,6 +3942,16 @@ static void llm_load_hparams(
3679
3942
  default: model.type = e_model::MODEL_UNKNOWN;
3680
3943
  }
3681
3944
  } break;
3945
+ case LLM_ARCH_XVERSE:
3946
+ {
3947
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
3948
+ switch (hparams.n_layer) {
3949
+ case 32: model.type = e_model::MODEL_7B; break;
3950
+ case 40: model.type = e_model::MODEL_13B; break;
3951
+ case 80: model.type = e_model::MODEL_65B; break;
3952
+ default: model.type = e_model::MODEL_UNKNOWN;
3953
+ }
3954
+ } break;
3682
3955
  case LLM_ARCH_COMMAND_R:
3683
3956
  {
3684
3957
  ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -3709,7 +3982,7 @@ static void llm_load_vocab(
3709
3982
  llama_model & model) {
3710
3983
  auto & vocab = model.vocab;
3711
3984
 
3712
- struct gguf_context * ctx = ml.ctx_gguf;
3985
+ struct gguf_context * ctx = ml.meta;
3713
3986
 
3714
3987
  const auto kv = LLM_KV(model.arch);
3715
3988
 
@@ -3842,7 +4115,7 @@ static void llm_load_vocab(
3842
4115
  } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
3843
4116
  vocab.linefeed_id = vocab.special_pad_id;
3844
4117
  } else {
3845
- const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
4118
+ const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
3846
4119
  GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
3847
4120
  vocab.linefeed_id = ids[0];
3848
4121
  }
@@ -4075,6 +4348,7 @@ static bool llm_load_tensors(
4075
4348
 
4076
4349
  const int64_t n_layer = hparams.n_layer;
4077
4350
  const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
4351
+ bool use_mmap_buffer = true;
4078
4352
 
4079
4353
  // there is very little benefit to offloading the input layer, so always keep it on the CPU
4080
4354
  model.buft_input = llama_default_buffer_type_cpu(true);
@@ -4163,6 +4437,10 @@ static bool llm_load_tensors(
4163
4437
 
4164
4438
  // create one context per buffer type
4165
4439
  size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
4440
+
4441
+ // for moe merged tensors
4442
+ ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
4443
+
4166
4444
  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
4167
4445
  for (auto & it : buft_layer_count) {
4168
4446
  struct ggml_init_params params = {
@@ -4189,6 +4467,11 @@ static bool llm_load_tensors(
4189
4467
  const int64_t n_vocab = hparams.n_vocab;
4190
4468
  const int64_t n_vocab_type = hparams.n_vocab_type;
4191
4469
  const int64_t n_ff = hparams.n_ff;
4470
+ const int64_t n_expert = hparams.n_expert;
4471
+
4472
+ if (n_expert > 0 && hparams.n_expert_used == 0) {
4473
+ throw std::runtime_error("model has expert layers but no expert layers are used");
4474
+ }
4192
4475
 
4193
4476
  GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
4194
4477
 
@@ -4243,26 +4526,113 @@ static bool llm_load_tensors(
4243
4526
 
4244
4527
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4245
4528
 
4246
- layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
4247
-
4248
- if (layer.ffn_gate_inp == nullptr) {
4249
- GGML_ASSERT(hparams.n_expert == 0);
4250
- GGML_ASSERT(hparams.n_expert_used == 0);
4251
-
4529
+ if (n_expert == 0) {
4252
4530
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
4253
4531
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4254
4532
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4255
4533
  } else {
4256
- GGML_ASSERT(hparams.n_expert > 0);
4257
- GGML_ASSERT(hparams.n_expert_used > 0);
4258
-
4259
- // MoE branch
4260
- for (uint32_t x = 0; x < hparams.n_expert; ++x) {
4261
- layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff});
4262
- layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd});
4263
- layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff});
4534
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
4535
+
4536
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
4537
+ if (layer.ffn_gate_exps) {
4538
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
4539
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
4540
+ } else {
4541
+ // merge split expert into a single tensor for compatibility with older models
4542
+ // requires disabling mmap
4543
+ use_mmap_buffer = false;
4544
+
4545
+ ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
4546
+ ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
4547
+ ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
4548
+
4549
+ layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
4550
+ layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
4551
+ layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
4552
+
4553
+ ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
4554
+ ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
4555
+ ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
4556
+
4557
+ for (uint32_t x = 0; x < n_expert; ++x) {
4558
+ // the individual experts are loaded into a view of the merged tensor
4559
+ ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
4560
+ ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
4561
+ ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
4562
+ }
4563
+ }
4564
+ }
4565
+ }
4566
+ } break;
4567
+ case LLM_ARCH_GROK:
4568
+ {
4569
+ if (n_expert == 0) {
4570
+ throw std::runtime_error("Grok model cannot have zero experts");
4571
+ }
4572
+
4573
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4574
+
4575
+ // output
4576
+ {
4577
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4578
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
4579
+ // if output is NULL, init from the input tok embed
4580
+ if (model.output == NULL) {
4581
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4582
+ ml.n_created--; // artificial tensor
4583
+ ml.size_data += ggml_nbytes(model.output);
4584
+ }
4585
+ }
4586
+
4587
+ for (int i = 0; i < n_layer; ++i) {
4588
+ ggml_context * ctx_layer = ctx_for_layer(i);
4589
+ ggml_context * ctx_split = ctx_for_layer_split(i);
4590
+
4591
+ auto & layer = model.layers[i];
4592
+
4593
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4594
+
4595
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
4596
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
4597
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
4598
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
4599
+
4600
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
4601
+
4602
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4603
+
4604
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
4605
+
4606
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
4607
+ if (layer.ffn_gate_exps) {
4608
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
4609
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
4610
+ } else {
4611
+ // merge split expert into a single tensor for compatibility with older models
4612
+ // requires disabling mmap
4613
+ use_mmap_buffer = false;
4614
+
4615
+ ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
4616
+ ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
4617
+ ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
4618
+
4619
+ layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
4620
+ layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
4621
+ layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
4622
+
4623
+ ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
4624
+ ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
4625
+ ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
4626
+
4627
+ for (uint32_t x = 0; x < n_expert; ++x) {
4628
+ // the individual experts are loaded into a view of the merged tensor
4629
+ ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
4630
+ ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
4631
+ ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
4264
4632
  }
4265
4633
  }
4634
+
4635
+ layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
4266
4636
  }
4267
4637
  } break;
4268
4638
  case LLM_ARCH_BAICHUAN:
@@ -4319,10 +4689,8 @@ static bool llm_load_tensors(
4319
4689
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4320
4690
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
4321
4691
 
4322
- if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
4323
- layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
4324
- layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
4325
- }
4692
+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
4693
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);
4326
4694
 
4327
4695
  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
4328
4696
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
@@ -4502,6 +4870,7 @@ static bool llm_load_tensors(
4502
4870
  case LLM_ARCH_MPT:
4503
4871
  {
4504
4872
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4873
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);
4505
4874
 
4506
4875
  // output
4507
4876
  {
@@ -4540,6 +4909,12 @@ static bool llm_load_tensors(
4540
4909
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4541
4910
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
4542
4911
 
4912
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
4913
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
4914
+
4915
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
4916
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
4917
+
4543
4918
  // AWQ ScaleActivation layer
4544
4919
  layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
4545
4920
  }
@@ -4986,6 +5361,28 @@ static bool llm_load_tensors(
4986
5361
  layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
4987
5362
  }
4988
5363
  } break;
5364
+ case LLM_ARCH_XVERSE:
5365
+ {
5366
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5367
+ {
5368
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5369
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
5370
+ }
5371
+ for (int i = 0; i < n_layer; ++i) {
5372
+ ggml_context * ctx_layer = ctx_for_layer(i);
5373
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5374
+ auto & layer = model.layers[i];
5375
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5376
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5377
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5378
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5379
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5380
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
5381
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5382
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
5383
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5384
+ }
5385
+ } break;
4989
5386
  case LLM_ARCH_COMMAND_R:
4990
5387
  {
4991
5388
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5024,56 +5421,97 @@ static bool llm_load_tensors(
5024
5421
 
5025
5422
  ml.done_getting_tensors();
5026
5423
 
5027
- ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr);
5424
+ ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
5425
+ model.mappings.reserve(ml.mappings.size());
5028
5426
 
5029
5427
  // create the backend buffers
5030
- std::vector<std::pair<ggml_context *, ggml_backend_buffer_t>> ctx_bufs;
5428
+ std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
5429
+ ctx_bufs.reserve(ctx_map.size());
5430
+
5431
+ // Ensure we have enough capacity for the maximum number of backend buffers we will potentially create
5432
+ size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
5433
+ model.bufs.reserve(n_max_backend_buffer);
5031
5434
 
5032
5435
  for (auto & it : ctx_map) {
5033
5436
  ggml_backend_buffer_type_t buft = it.first;
5034
- ggml_context * ctx = it.second;
5035
- ggml_backend_buffer_t buf = nullptr;
5437
+ ggml_context * ctx = it.second;
5438
+
5439
+ llama_buf_map bufs;
5440
+ bufs.reserve(n_max_backend_buffer);
5036
5441
 
5037
5442
  // only the mmap region containing the tensors in the model is mapped to the backend buffer
5038
5443
  // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
5039
5444
  // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
5040
- if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
5041
- size_t first, last;
5042
- ml.get_mapping_range(&first, &last, ctx);
5043
- buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
5044
- #ifdef GGML_USE_CUBLAS
5045
- if (n_layer >= n_gpu_layers) {
5046
- ggml_backend_cuda_register_host_buffer(
5445
+ if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
5446
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
5447
+ void * addr = nullptr;
5448
+ size_t first, last;
5449
+ ml.get_mapping_range(&first, &last, &addr, idx, ctx);
5450
+ if (first >= last) {
5451
+ continue;
5452
+ }
5453
+ ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
5454
+ if (buf == nullptr) {
5455
+ throw std::runtime_error("unable to allocate backend CPU buffer");
5456
+ }
5457
+ model.bufs.push_back(buf);
5458
+ bufs.emplace(idx, buf);
5459
+ #ifdef GGML_USE_CUDA
5460
+ if (n_layer >= n_gpu_layers) {
5461
+ ggml_backend_cuda_register_host_buffer(
5047
5462
  ggml_backend_buffer_get_base(buf),
5048
5463
  ggml_backend_buffer_get_size(buf));
5049
- }
5464
+ }
5050
5465
  #endif
5466
+ }
5051
5467
  }
5052
5468
  #ifdef GGML_USE_METAL
5053
- else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
5054
- const size_t max_size = ggml_get_max_tensor_size(ctx);
5055
- size_t first, last;
5056
- ml.get_mapping_range(&first, &last, ctx);
5057
- buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size);
5469
+ else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
5470
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
5471
+ const size_t max_size = ggml_get_max_tensor_size(ctx);
5472
+ void * addr = nullptr;
5473
+ size_t first, last;
5474
+ ml.get_mapping_range(&first, &last, &addr, idx, ctx);
5475
+ if (first >= last) {
5476
+ continue;
5477
+ }
5478
+ ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
5479
+ if (buf == nullptr) {
5480
+ throw std::runtime_error("unable to allocate backend metal buffer");
5481
+ }
5482
+ model.bufs.push_back(buf);
5483
+ bufs.emplace(idx, buf);
5484
+ }
5058
5485
  }
5059
5486
  #endif
5060
5487
  else {
5061
- buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
5062
- if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
5488
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
5489
+ if (buf == nullptr) {
5490
+ throw std::runtime_error("unable to allocate backend buffer");
5491
+ }
5492
+ model.bufs.push_back(buf);
5493
+ if (use_mlock && ggml_backend_buffer_is_host(buf)) {
5063
5494
  model.mlock_bufs.emplace_back(new llama_mlock);
5064
5495
  auto & mlock_buf = model.mlock_bufs.back();
5065
5496
  mlock_buf->init (ggml_backend_buffer_get_base(buf));
5066
5497
  mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
5067
5498
  }
5499
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
5500
+ bufs.emplace(idx, buf);
5501
+ }
5068
5502
  }
5069
- if (buf == nullptr) {
5503
+
5504
+ if (bufs.empty()) {
5070
5505
  throw std::runtime_error("failed to allocate buffer");
5071
5506
  }
5072
- // indicate that this buffer contains weights
5073
- // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
5074
- ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
5075
- model.bufs.push_back(buf);
5076
- ctx_bufs.emplace_back(ctx, buf);
5507
+
5508
+ for (auto & buf : bufs) {
5509
+ // indicate that this buffer contains weights
5510
+ // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
5511
+ ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
5512
+ }
5513
+
5514
+ ctx_bufs.emplace_back(ctx, bufs);
5077
5515
  }
5078
5516
 
5079
5517
  if (llama_supports_gpu_offload()) {
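The hunk above switches the loader from a single model-wide mmap to one mapping per GGUF split: each context now collects a llama_buf_map of backend buffers keyed by file index, and only the byte range of a split that actually holds tensors for that context is wrapped as a buffer. A minimal sketch of that wrapping step, using only the ggml-backend call shown above (the helper name is made up for illustration):

#include <cstddef>

#include "ggml-backend.h"

// Wrap only the [first, last) byte range of one mmap'ed split as a CPU backend
// buffer, mirroring the per-file loop above (sketch; hypothetical helper name).
static ggml_backend_buffer_t buffer_from_mapping_range(void * addr, size_t first, size_t last) {
    if (first >= last) {
        return nullptr; // this split holds no tensors for the current context
    }
    return ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
}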
@@ -5105,13 +5543,17 @@ static bool llm_load_tensors(
5105
5543
  // load tensor data
5106
5544
  for (auto & it : ctx_bufs) {
5107
5545
  ggml_context * ctx = it.first;
5108
- ggml_backend_buffer_t buf = it.second;
5109
- if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) {
5546
+ auto & bufs = it.second;
5547
+ if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
5110
5548
  return false;
5111
5549
  }
5112
5550
  }
5113
5551
 
5114
- model.mapping = std::move(ml.mapping);
5552
+ if (use_mmap_buffer) {
5553
+ for (auto & mapping : ml.mappings) {
5554
+ model.mappings.emplace_back(std::move(mapping));
5555
+ }
5556
+ }
5115
5557
 
5116
5558
  // loading time will be recalculate after the first eval, so
5117
5559
  // we take page faults deferred by mmap() into consideration
@@ -5266,8 +5708,8 @@ static void llm_build_kv_store(
5266
5708
  GGML_ASSERT(kv.size == n_ctx);
5267
5709
 
5268
5710
  // compute the transposed [n_tokens, n_embd] V matrix
5269
- struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
5270
- //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
5711
+ assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
5712
+ struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
5271
5713
  cb(v_cur_t, "v_cur_t", il);
5272
5714
 
5273
5715
  struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
@@ -5451,6 +5893,20 @@ static struct ggml_tensor * llm_build_kqv(
5451
5893
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
5452
5894
  }
5453
5895
 
5896
+ if (model.arch == LLM_ARCH_GROK) {
5897
+ // need to do the following:
5898
+ // multiply by attn_output_multiplier of 0.08838834764831845
5898
+ // and then:
5900
+ // kq = 30 * tanh(kq / 30)
5901
+ // before the softmax below
5902
+
5903
+ //try from phi2
5904
+ //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
5905
+
5906
+ kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
5907
+ kq = ggml_scale(ctx, kq, 30);
5908
+ }
5909
+
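The two steps described in the comment are folded into a single scale: 30 * tanh((m * kq) / 30) equals applying the multiplier m = 0.08838834764831845 (which is 1/sqrt(128)) and then the 30 * tanh(x / 30) soft cap, so the code scales by m/30, takes tanh, and scales by 30. A scalar sketch of the same soft cap (the helper name is illustrative only):

#include <cmath>

// Scalar sketch of the attention-logit soft cap applied above:
// 30 * tanh((m * kq) / 30) == 30 * tanh(kq * (m / 30)), with m = 0.08838834764831845.
static float grok_attn_soft_cap(float kq) {
    const float attn_output_multiplier = 0.08838834764831845f;
    return 30.0f * std::tanh(kq * (attn_output_multiplier / 30.0f));
}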
5454
5910
  #if defined(GGML_USE_KOMPUTE)
5455
5911
  #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
5456
5912
  #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
@@ -5577,7 +6033,8 @@ struct llm_build_context {
5577
6033
  const float norm_rms_eps;
5578
6034
 
5579
6035
  const int32_t n_tokens;
5580
- const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx)
6036
+ const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
6037
+ const int32_t n_outputs;
5581
6038
  const int32_t kv_head; // index of where we store new KV data in the cache
5582
6039
  const int32_t n_orig_ctx;
5583
6040
 
@@ -5624,6 +6081,7 @@ struct llm_build_context {
5624
6081
  norm_rms_eps (hparams.f_norm_rms_eps),
5625
6082
  n_tokens (batch.n_tokens),
5626
6083
  n_kv (worst_case ? kv_self.size : kv_self.n),
6084
+ n_outputs (worst_case ? n_tokens : lctx.n_outputs),
5627
6085
  kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
5628
6086
  n_orig_ctx (cparams.n_yarn_orig_ctx),
5629
6087
  pooling_type (cparams.pooling_type),
@@ -5645,6 +6103,7 @@ struct llm_build_context {
5645
6103
  lctx.inp_tokens = nullptr;
5646
6104
  lctx.inp_embd = nullptr;
5647
6105
  lctx.inp_pos = nullptr;
6106
+ lctx.inp_out_ids = nullptr;
5648
6107
  lctx.inp_KQ_mask = nullptr;
5649
6108
  lctx.inp_KQ_pos = nullptr;
5650
6109
  lctx.inp_K_shift = nullptr;
@@ -5768,6 +6227,13 @@ struct llm_build_context {
5768
6227
  return lctx.inp_pos;
5769
6228
  }
5770
6229
 
6230
+ struct ggml_tensor * build_inp_out_ids() {
6231
+ lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
6232
+ cb(lctx.inp_out_ids, "inp_out_ids", -1);
6233
+ ggml_set_input(lctx.inp_out_ids);
6234
+ return lctx.inp_out_ids;
6235
+ }
6236
+
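inp_out_ids holds the batch indices of the tokens whose outputs were actually requested; each graph below applies ggml_get_rows with it at the last layer, so the final norm and lm_head run over n_outputs rows instead of n_tokens. A plain-C++ sketch of the row selection that ggml_get_rows performs here (conceptual only, not ggml code):

#include <cstdint>
#include <vector>

// Conceptual version of ggml_get_rows(cur, inp_out_ids) at the last layer:
// keep only the rows (tokens) whose outputs were requested, in the order listed.
static std::vector<float> select_output_rows(const std::vector<float> & cur,       // n_tokens x n_embd, row-major
                                             const std::vector<int32_t> & out_ids, // indices of requested tokens
                                             size_t n_embd) {
    std::vector<float> out;
    out.reserve(out_ids.size() * n_embd);
    for (int32_t row : out_ids) {
        out.insert(out.end(), cur.begin() + (size_t) row * n_embd, cur.begin() + ((size_t) row + 1) * n_embd);
    }
    return out;
}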
5771
6237
  struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
5772
6238
  if (causal) {
5773
6239
  lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
@@ -5824,6 +6290,9 @@ struct llm_build_context {
5824
6290
  struct ggml_cgraph * build_llama() {
5825
6291
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5826
6292
 
6293
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
6294
+ int32_t n_tokens = this->n_tokens;
6295
+
5827
6296
  const int64_t n_embd_head = hparams.n_embd_head_v;
5828
6297
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5829
6298
  GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -5891,6 +6360,14 @@ struct llm_build_context {
5891
6360
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5892
6361
  }
5893
6362
 
6363
+ if (il == n_layer - 1) {
6364
+ // skip computing output for unused tokens
6365
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6366
+ n_tokens = n_outputs;
6367
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6368
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6369
+ }
6370
+
5894
6371
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
5895
6372
  cb(ffn_inp, "ffn_inp", il);
5896
6373
 
@@ -5943,19 +6420,19 @@ struct llm_build_context {
5943
6420
  for (int i = 0; i < n_expert_used; ++i) {
5944
6421
  ggml_tensor * cur_expert;
5945
6422
 
5946
- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
6423
+ ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
5947
6424
  cb(cur_up, "ffn_moe_up", il);
5948
6425
 
5949
- ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
6426
+ ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
5950
6427
  cb(cur_gate, "ffn_moe_gate", il);
5951
6428
 
5952
6429
  cur_gate = ggml_silu(ctx0, cur_gate);
5953
6430
  cb(cur_gate, "ffn_moe_silu", il);
5954
6431
 
5955
- cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
6432
+ cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
5956
6433
  cb(cur_expert, "ffn_moe_gate_par", il);
5957
6434
 
5958
- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
6435
+ cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
5959
6436
  cb(cur_expert, "ffn_moe_down", il);
5960
6437
 
5961
6438
  cur_expert = ggml_mul(ctx0, cur_expert,
@@ -6070,6 +6547,13 @@ struct llm_build_context {
6070
6547
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6071
6548
  }
6072
6549
 
6550
+ if (il == n_layer - 1) {
6551
+ // skip computing output for unused tokens
6552
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6553
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6554
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6555
+ }
6556
+
6073
6557
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6074
6558
  cb(ffn_inp, "ffn_inp", il);
6075
6559
 
@@ -6112,6 +6596,111 @@ struct llm_build_context {
6112
6596
  return gf;
6113
6597
  }
6114
6598
 
6599
+ struct ggml_cgraph * build_xverse() {
6600
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6601
+
6602
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6603
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6604
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6605
+
6606
+ struct ggml_tensor * cur;
6607
+ struct ggml_tensor * inpL;
6608
+
6609
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6610
+
6611
+ // inp_pos - contains the positions
6612
+ struct ggml_tensor * inp_pos = build_inp_pos();
6613
+
6614
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6615
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6616
+
6617
+ // positions of the tokens in the KV cache
6618
+ struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
6619
+
6620
+ for (int il = 0; il < n_layer; ++il) {
6621
+ struct ggml_tensor * inpSA = inpL;
6622
+
6623
+ cur = llm_build_norm(ctx0, inpL, hparams,
6624
+ model.layers[il].attn_norm, NULL,
6625
+ LLM_NORM_RMS, cb, il);
6626
+ cb(cur, "attn_norm", il);
6627
+
6628
+ // self-attention
6629
+ {
6630
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6631
+ cb(Qcur, "Qcur", il);
6632
+
6633
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
6634
+ cb(Kcur, "Kcur", il);
6635
+
6636
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
6637
+ cb(Vcur, "Vcur", il);
6638
+
6639
+ Qcur = ggml_rope_custom(
6640
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6641
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6642
+ ext_factor, attn_factor, beta_fast, beta_slow
6643
+ );
6644
+ cb(Qcur, "Qcur", il);
6645
+
6646
+ Kcur = ggml_rope_custom(
6647
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6648
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6649
+ ext_factor, attn_factor, beta_fast, beta_slow
6650
+ );
6651
+ cb(Kcur, "Kcur", il);
6652
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6653
+ model.layers[il].wo, NULL,
6654
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6655
+ }
6656
+
6657
+ if (il == n_layer - 1) {
6658
+ // skip computing output for unused tokens
6659
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6660
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6661
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6662
+ }
6663
+
6664
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6665
+ cb(ffn_inp, "ffn_inp", il);
6666
+
6667
+ // feed-forward network
6668
+ {
6669
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6670
+ model.layers[il].ffn_norm, NULL,
6671
+ LLM_NORM_RMS, cb, il);
6672
+ cb(cur, "ffn_norm", il);
6673
+
6674
+ cur = llm_build_ffn(ctx0, cur,
6675
+ model.layers[il].ffn_up, NULL,
6676
+ model.layers[il].ffn_gate, NULL,
6677
+ model.layers[il].ffn_down, NULL,
6678
+ NULL,
6679
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
6680
+ cb(cur, "ffn_out", il);
6681
+ }
6682
+
6683
+ cur = ggml_add(ctx0, cur, ffn_inp);
6684
+ cb(cur, "l_out", il);
6685
+
6686
+ // input for next layer
6687
+ inpL = cur;
6688
+ }
6689
+
6690
+ cur = inpL;
6691
+
6692
+ cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
6693
+ cb(cur, "result_norm", -1);
6694
+
6695
+ // lm_head
6696
+ cur = ggml_mul_mat(ctx0, model.output, cur);
6697
+ cb(cur, "result_output", -1);
6698
+
6699
+ ggml_build_forward_expand(gf, cur);
6700
+
6701
+ return gf;
6702
+ }
6703
+
6115
6704
  struct ggml_cgraph * build_falcon() {
6116
6705
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6117
6706
 
@@ -6185,6 +6774,14 @@ struct llm_build_context {
6185
6774
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6186
6775
  }
6187
6776
 
6777
+ if (il == n_layer - 1) {
6778
+ // skip computing output for unused tokens
6779
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6780
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6781
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6782
+ attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
6783
+ }
6784
+
6188
6785
  struct ggml_tensor * ffn_inp = cur;
6189
6786
 
6190
6787
  // feed forward
@@ -6225,6 +6822,214 @@ struct llm_build_context {
6225
6822
  return gf;
6226
6823
  }
6227
6824
 
6825
+ struct ggml_cgraph * build_grok() {
6826
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6827
+
6828
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
6829
+ int32_t n_tokens = this->n_tokens;
6830
+
6831
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6832
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6833
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6834
+
6835
+ struct ggml_tensor * cur;
6836
+ struct ggml_tensor * inpL;
6837
+
6838
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6839
+
6840
+ // multiply by embedding_multiplier_scale of 78.38367176906169
6841
+ inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
6842
+
6843
+ // inp_pos - contains the positions
6844
+ struct ggml_tensor * inp_pos = build_inp_pos();
6845
+
6846
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6847
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6848
+
6849
+ for (int il = 0; il < n_layer; ++il) {
6850
+ struct ggml_tensor * inpSA = inpL;
6851
+
6852
+ // norm
6853
+ cur = llm_build_norm(ctx0, inpL, hparams,
6854
+ model.layers[il].attn_norm, NULL,
6855
+ LLM_NORM_RMS, cb, il);
6856
+ cb(cur, "attn_norm", il);
6857
+
6858
+
6859
+ // self-attention
6860
+ {
6861
+ // compute Q and K and RoPE them
6862
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6863
+ cb(Qcur, "Qcur", il);
6864
+ if (model.layers[il].bq) {
6865
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
6866
+ cb(Qcur, "Qcur", il);
6867
+ }
6868
+
6869
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
6870
+ cb(Kcur, "Kcur", il);
6871
+ if (model.layers[il].bk) {
6872
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
6873
+ cb(Kcur, "Kcur", il);
6874
+ }
6875
+
6876
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
6877
+ cb(Vcur, "Vcur", il);
6878
+ if (model.layers[il].bv) {
6879
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
6880
+ cb(Vcur, "Vcur", il);
6881
+ }
6882
+
6883
+ Qcur = ggml_rope_custom(
6884
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6885
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6886
+ ext_factor, attn_factor, beta_fast, beta_slow
6887
+ );
6888
+ cb(Qcur, "Qcur", il);
6889
+
6890
+ Kcur = ggml_rope_custom(
6891
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6892
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6893
+ ext_factor, attn_factor, beta_fast, beta_slow
6894
+ );
6895
+ cb(Kcur, "Kcur", il);
6896
+
6897
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6898
+ model.layers[il].wo, model.layers[il].bo,
6899
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
6900
+ }
6901
+
6902
+ if (il == n_layer - 1) {
6903
+ // skip computing output for unused tokens
6904
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6905
+ n_tokens = n_outputs;
6906
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6907
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6908
+ }
6909
+
6910
+ // Grok
6911
+ // if attn_out_norm is present then apply it before adding the input
6912
+ if (model.layers[il].attn_out_norm) {
6913
+ cur = llm_build_norm(ctx0, cur, hparams,
6914
+ model.layers[il].attn_out_norm, NULL,
6915
+ LLM_NORM_RMS, cb, il);
6916
+ cb(cur, "attn_out_norm", il);
6917
+ }
6918
+
6919
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6920
+ cb(ffn_inp, "ffn_inp", il);
6921
+
6922
+ // feed-forward network
6923
+ // MoE branch
6924
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6925
+ model.layers[il].ffn_norm, NULL,
6926
+ LLM_NORM_RMS, cb, il);
6927
+ cb(cur, "ffn_norm", il);
6928
+
6929
+ ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
6930
+ cb(logits, "ffn_moe_logits", il);
6931
+
6932
+ ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
6933
+ cb(probs, "ffn_moe_probs", il);
6934
+
6935
+ // select experts
6936
+ ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
6937
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
6938
+
6939
+ ggml_tensor * weights = ggml_get_rows(ctx0,
6940
+ ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
6941
+ cb(weights, "ffn_moe_weights", il);
6942
+
6943
+ weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
6944
+
6945
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
6946
+ cb(weights_sum, "ffn_moe_weights_sum", il);
6947
+
6948
+ weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
6949
+ cb(weights, "ffn_moe_weights_norm", il);
6950
+
6951
+ // compute expert outputs
6952
+ ggml_tensor * moe_out = nullptr;
6953
+
6954
+ for (int i = 0; i < n_expert_used; ++i) {
6955
+ ggml_tensor * cur_expert;
6956
+
6957
+ ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
6958
+ cb(cur_up, "ffn_moe_up", il);
6959
+
6960
+ ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
6961
+ cb(cur_gate, "ffn_moe_gate", il);
6962
+
6963
+ //GeLU
6964
+ cur_gate = ggml_gelu(ctx0, cur_gate);
6965
+ cb(cur_gate, "ffn_moe_gelu", il);
6966
+
6967
+ cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
6968
+ cb(cur_expert, "ffn_moe_gate_par", il);
6969
+
6970
+ cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
6971
+ cb(cur_expert, "ffn_moe_down", il);
6972
+
6973
+ cur_expert = ggml_mul(ctx0, cur_expert,
6974
+ ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
6975
+ cb(cur_expert, "ffn_moe_weighted", il);
6976
+
6977
+ if (i == 0) {
6978
+ moe_out = cur_expert;
6979
+ } else {
6980
+ moe_out = ggml_add(ctx0, moe_out, cur_expert);
6981
+ cb(moe_out, "ffn_moe_out", il);
6982
+ }
6983
+ }
6984
+
6985
+ cur = moe_out;
6986
+
6987
+ // Grok
6988
+ // if layer_out_norm is present then apply it before adding the input
6989
+ // Idea: maybe ffn_out_norm is a better name
6990
+ if (model.layers[il].layer_out_norm) {
6991
+ cur = llm_build_norm(ctx0, cur, hparams,
6992
+ model.layers[il].layer_out_norm, NULL,
6993
+ LLM_NORM_RMS, cb, il);
6994
+ cb(cur, "layer_out_norm", il);
6995
+ }
6996
+
6997
+
6998
+ cur = ggml_add(ctx0, cur, ffn_inp);
6999
+ cb(cur, "ffn_out", il);
7000
+
7001
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
7002
+ if (layer_dir != nullptr) {
7003
+ cur = ggml_add(ctx0, cur, layer_dir);
7004
+ }
7005
+ cb(cur, "l_out", il);
7006
+
7007
+ // input for next layer
7008
+ inpL = cur;
7009
+ }
7010
+
7011
+ cur = inpL;
7012
+
7013
+ cur = llm_build_norm(ctx0, cur, hparams,
7014
+ model.output_norm, NULL,
7015
+ LLM_NORM_RMS, cb, -1);
7016
+ cb(cur, "result_norm", -1);
7017
+
7018
+ // lm_head
7019
+ cur = ggml_mul_mat(ctx0, model.output, cur);
7020
+
7021
+ // Grok
7022
+ // multiply logits by output_multiplier_scale of 0.5773502691896257
7023
+
7024
+ cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
7025
+
7026
+ cb(cur, "result_output", -1);
7027
+
7028
+ ggml_build_forward_expand(gf, cur);
7029
+
7030
+ return gf;
7031
+ }
7032
+
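In the MoE block of build_grok above, routing takes a softmax over all expert logits, keeps the top n_expert_used entries, and renormalizes just those by their sum (the ffn_moe_weights_sum / ffn_moe_weights_norm steps). A self-contained toy example of that renormalization with four experts and top-2 routing (the logits are made up):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Toy renormalization of top-k routing weights, mirroring the
// ffn_moe_weights_sum / ffn_moe_weights_norm steps above: 4 experts, top-2.
int main() {
    const std::vector<float> logits = { 1.0f, 2.0f, 0.5f, 3.0f };

    // softmax over all experts
    std::vector<float> probs(logits.size());
    const float mx = *std::max_element(logits.begin(), logits.end());
    float denom = 0.0f;
    for (size_t i = 0; i < logits.size(); ++i) { probs[i] = std::exp(logits[i] - mx); denom += probs[i]; }
    for (float & p : probs) { p /= denom; }

    // keep the two most probable experts (3 and 1 for these logits) and renormalize their weights
    const size_t top[2] = { 3, 1 };
    const float sum = probs[top[0]] + probs[top[1]];
    for (size_t e : top) {
        std::printf("expert %zu: routing weight %.3f\n", e, probs[e] / sum);
    }
    return 0;
}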
6228
7033
  struct ggml_cgraph * build_starcoder() {
6229
7034
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6230
7035
 
@@ -6279,6 +7084,13 @@ struct llm_build_context {
6279
7084
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6280
7085
  }
6281
7086
 
7087
+ if (il == n_layer - 1) {
7088
+ // skip computing output for unused tokens
7089
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7090
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7091
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7092
+ }
7093
+
6282
7094
  // add the input
6283
7095
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
6284
7096
  cb(ffn_inp, "ffn_inp", il);
@@ -6476,6 +7288,13 @@ struct llm_build_context {
6476
7288
  Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6477
7289
  }
6478
7290
 
7291
+ if (il == n_layer - 1) {
7292
+ // skip computing output for unused tokens
7293
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7294
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7295
+ residual = ggml_get_rows(ctx0, residual, inp_out_ids);
7296
+ }
7297
+
6479
7298
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
6480
7299
  cb(ffn_inp, "ffn_inp", il);
6481
7300
 
@@ -6565,6 +7384,13 @@ struct llm_build_context {
6565
7384
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6566
7385
  }
6567
7386
 
7387
+ if (il == n_layer - 1) {
7388
+ // skip computing output for unused tokens
7389
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7390
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7391
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7392
+ }
7393
+
6568
7394
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6569
7395
  cb(ffn_inp, "ffn_inp", il);
6570
7396
 
@@ -6722,6 +7548,13 @@ struct llm_build_context {
6722
7548
  }
6723
7549
  cb(cur, "kqv_out", il);
6724
7550
 
7551
+ if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
7552
+ // skip computing output for unused tokens
7553
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7554
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7555
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7556
+ }
7557
+
6725
7558
  // re-add the layer input
6726
7559
  cur = ggml_add(ctx0, cur, inpL);
6727
7560
 
@@ -6844,6 +7677,13 @@ struct llm_build_context {
6844
7677
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6845
7678
  }
6846
7679
 
7680
+ if (il == n_layer - 1) {
7681
+ // skip computing output for unused tokens
7682
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7683
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7684
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7685
+ }
7686
+
6847
7687
  // Add the input
6848
7688
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
6849
7689
  cb(ffn_inp, "ffn_inp", il);
@@ -6891,6 +7731,7 @@ struct llm_build_context {
6891
7731
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6892
7732
 
6893
7733
  struct ggml_tensor * cur;
7734
+ struct ggml_tensor * pos;
6894
7735
  struct ggml_tensor * inpL;
6895
7736
 
6896
7737
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -6901,6 +7742,16 @@ struct llm_build_context {
6901
7742
  // positions of the tokens in the KV cache
6902
7743
  struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
6903
7744
 
7745
+ if (model.pos_embd) {
7746
+ // inp_pos - contains the positions
7747
+ struct ggml_tensor * inp_pos = build_inp_pos();
7748
+ pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
7749
+ cb(pos, "pos_embd", -1);
7750
+
7751
+ inpL = ggml_add(ctx0, inpL, pos);
7752
+ cb(inpL, "inpL", -1);
7753
+ }
7754
+
6904
7755
  for (int il = 0; il < n_layer; ++il) {
6905
7756
  struct ggml_tensor * attn_norm;
6906
7757
 
@@ -6935,11 +7786,39 @@ struct llm_build_context {
6935
7786
  cb(Kcur, "Kcur", il);
6936
7787
  cb(Vcur, "Vcur", il);
6937
7788
 
6938
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7789
+ // Q/K Layernorm
7790
+ if (model.layers[il].attn_q_norm) {
7791
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
7792
+ model.layers[il].attn_q_norm,
7793
+ model.layers[il].attn_q_norm_b,
7794
+ LLM_NORM, cb, il);
7795
+ cb(Qcur, "Qcur", il);
6939
7796
 
6940
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7797
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
7798
+ model.layers[il].attn_k_norm,
7799
+ model.layers[il].attn_k_norm_b,
7800
+ LLM_NORM, cb, il);
7801
+ cb(Kcur, "Kcur", il);
7802
+
7803
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7804
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7805
+
7806
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6941
7807
  model.layers[il].wo, model.layers[il].bo,
6942
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7808
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7809
+ } else {
7810
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7811
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7812
+ model.layers[il].wo, model.layers[il].bo,
7813
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7814
+ }
7815
+ }
7816
+
7817
+ if (il == n_layer - 1) {
7818
+ // skip computing output for unused tokens
7819
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7820
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7821
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6943
7822
  }
6944
7823
 
6945
7824
  // Add the input
@@ -7055,6 +7934,13 @@ struct llm_build_context {
7055
7934
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7056
7935
  }
7057
7936
 
7937
+ if (il == n_layer - 1) {
7938
+ // skip computing output for unused tokens
7939
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7940
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7941
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7942
+ }
7943
+
7058
7944
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7059
7945
  cb(ffn_inp, "ffn_inp", il);
7060
7946
 
@@ -7161,6 +8047,13 @@ struct llm_build_context {
7161
8047
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7162
8048
  }
7163
8049
 
8050
+ if (il == n_layer - 1) {
8051
+ // skip computing output for unused tokens
8052
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8053
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8054
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8055
+ }
8056
+
7164
8057
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7165
8058
  cb(ffn_inp, "ffn_inp", il);
7166
8059
 
@@ -7273,6 +8166,13 @@ struct llm_build_context {
7273
8166
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7274
8167
  }
7275
8168
 
8169
+ if (il == n_layer - 1) {
8170
+ // skip computing output for unused tokens
8171
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8172
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8173
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8174
+ }
8175
+
7276
8176
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7277
8177
  cb(ffn_inp, "ffn_inp", il);
7278
8178
 
@@ -7391,6 +8291,14 @@ struct llm_build_context {
7391
8291
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7392
8292
  }
7393
8293
 
8294
+ if (il == n_layer - 1) {
8295
+ // skip computing output for unused tokens
8296
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8297
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8298
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8299
+ attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
8300
+ }
8301
+
7394
8302
  // FF
7395
8303
  {
7396
8304
  ffn_output = llm_build_ffn(ctx0, attn_norm_output,
@@ -7488,6 +8396,14 @@ struct llm_build_context {
7488
8396
 
7489
8397
  cur = attention_norm;
7490
8398
 
8399
+ if (il == n_layer - 1) {
8400
+ // skip computing output for unused tokens
8401
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8402
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8403
+ sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
8404
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8405
+ }
8406
+
7491
8407
  // feed-forward network
7492
8408
  {
7493
8409
  cur = llm_build_ffn(ctx0, cur,
@@ -7580,6 +8496,13 @@ struct llm_build_context {
7580
8496
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7581
8497
  }
7582
8498
 
8499
+ if (il == n_layer - 1) {
8500
+ // skip computing output for unused tokens
8501
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8502
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8503
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8504
+ }
8505
+
7583
8506
  // add the input
7584
8507
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
7585
8508
  cb(ffn_inp, "ffn_inp", il);
@@ -7680,6 +8603,13 @@ struct llm_build_context {
7680
8603
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7681
8604
  }
7682
8605
 
8606
+ if (il == n_layer - 1) {
8607
+ // skip computing output for unused tokens
8608
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8609
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8610
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8611
+ }
8612
+
7683
8613
  // add the input
7684
8614
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
7685
8615
  cb(ffn_inp, "ffn_inp", il);
@@ -7789,6 +8719,13 @@ struct llm_build_context {
7789
8719
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7790
8720
  }
7791
8721
 
8722
+ if (il == n_layer - 1) {
8723
+ // skip computing output for unused tokens
8724
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8725
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8726
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8727
+ }
8728
+
7792
8729
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7793
8730
  cb(ffn_inp, "ffn_inp", il);
7794
8731
 
@@ -7899,6 +8836,13 @@ struct llm_build_context {
7899
8836
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7900
8837
  }
7901
8838
 
8839
+ if (il == n_layer - 1) {
8840
+ // skip computing output for unused tokens
8841
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8842
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8843
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8844
+ }
8845
+
7902
8846
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7903
8847
  cb(ffn_inp, "ffn_inp", il);
7904
8848
 
@@ -8022,6 +8966,13 @@ struct llm_build_context {
8022
8966
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8023
8967
  }
8024
8968
 
8969
+ if (il == n_layer - 1) {
8970
+ // skip computing output for unused tokens
8971
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8972
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8973
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8974
+ }
8975
+
8025
8976
  // scale_res - scale the hidden states for residual connection
8026
8977
  const float scale_res = scale_depth/sqrtf(float(n_layer));
8027
8978
  cur = ggml_scale(ctx0, cur, scale_res);
@@ -8136,6 +9087,13 @@ struct llm_build_context {
8136
9087
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
8137
9088
  }
8138
9089
 
9090
+ if (il == n_layer - 1) {
9091
+ // skip computing output for unused tokens
9092
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9093
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9094
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9095
+ }
9096
+
8139
9097
  struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
8140
9098
  cb(sa_out, "sa_out", il);
8141
9099
 
@@ -8248,6 +9206,13 @@ struct llm_build_context {
8248
9206
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8249
9207
  }
8250
9208
 
9209
+ if (il == n_layer - 1) {
9210
+ // skip computing output for unused tokens
9211
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9212
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9213
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
9214
+ }
9215
+
8251
9216
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8252
9217
  cb(ffn_inp, "ffn_inp", il);
8253
9218
 
@@ -8395,6 +9360,15 @@ struct llm_build_context {
8395
9360
 
8396
9361
  struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0);
8397
9362
 
9363
+ if (il == n_layer - 1) {
9364
+ // skip computing output for unused tokens
9365
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9366
+ x = ggml_get_rows(ctx0, x, inp_out_ids);
9367
+ y = ggml_get_rows(ctx0, y, inp_out_ids);
9368
+ z = ggml_get_rows(ctx0, z, inp_out_ids);
9369
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9370
+ }
9371
+
8398
9372
  // {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens}
8399
9373
  y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
8400
9374
  y = ggml_mul(ctx0, y, ggml_silu(ctx0, z));
@@ -8497,6 +9471,14 @@ struct llm_build_context {
8497
9471
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8498
9472
  }
8499
9473
 
9474
+ if (il == n_layer - 1) {
9475
+ // skip computing output for unused tokens
9476
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9477
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9478
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9479
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
9480
+ }
9481
+
8500
9482
  struct ggml_tensor * attn_out = cur;
8501
9483
 
8502
9484
  // feed-forward network
@@ -8648,6 +9630,10 @@ static struct ggml_cgraph * llama_build_graph(
8648
9630
  {
8649
9631
  result = llm.build_falcon();
8650
9632
  } break;
9633
+ case LLM_ARCH_GROK:
9634
+ {
9635
+ result = llm.build_grok();
9636
+ } break;
8651
9637
  case LLM_ARCH_STARCODER:
8652
9638
  {
8653
9639
  result = llm.build_starcoder();
@@ -8725,6 +9711,10 @@ static struct ggml_cgraph * llama_build_graph(
8725
9711
  {
8726
9712
  result = llm.build_mamba();
8727
9713
  } break;
9714
+ case LLM_ARCH_XVERSE:
9715
+ {
9716
+ result = llm.build_xverse();
9717
+ } break;
8728
9718
  case LLM_ARCH_COMMAND_R:
8729
9719
  {
8730
9720
  result = llm.build_command_r();
@@ -8790,9 +9780,39 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
8790
9780
  ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
8791
9781
  }
8792
9782
 
9783
+ if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
9784
+ GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
9785
+ const int64_t n_tokens = batch.n_tokens;
9786
+
9787
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
9788
+ int32_t * data = (int32_t *) lctx.inp_out_ids->data;
9789
+
9790
+ if (lctx.n_outputs == n_tokens) {
9791
+ for (int i = 0; i < n_tokens; ++i) {
9792
+ data[i] = i;
9793
+ }
9794
+ } else if (batch.logits) {
9795
+ int32_t n_outputs = 0;
9796
+ for (int i = 0; i < n_tokens; ++i) {
9797
+ if (batch.logits[i]) {
9798
+ data[n_outputs++] = i;
9799
+ }
9800
+ }
9801
+ // the graph needs to have been passed the correct number of outputs
9802
+ GGML_ASSERT(lctx.n_outputs == n_outputs);
9803
+ } else if (lctx.n_outputs == 1) {
9804
+ // only keep last output
9805
+ data[0] = n_tokens - 1;
9806
+ } else {
9807
+ GGML_ASSERT(lctx.n_outputs == 0);
9808
+ }
9809
+ }
9810
+
8793
9811
  GGML_ASSERT(
9812
+ // (!a || b) is a logical implication (a -> b)
9813
+ // !hparams.causal_attn -> !cparams.causal_attn
8794
9814
  (hparams.causal_attn || !cparams.causal_attn) &&
8795
- "non-causal attention with generative models is not supported"
9815
+ "causal attention with embedding models is not supported"
8796
9816
  );
8797
9817
 
8798
9818
  if (lctx.inp_KQ_mask) {
@@ -8971,6 +9991,74 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
8971
9991
  }
8972
9992
  }
8973
9993
 
9994
+ // Make sure enough space is available for outputs.
9995
+ // Returns max number of outputs for which space was reserved.
9996
+ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
9997
+ const auto & cparams = lctx.cparams;
9998
+ const auto & hparams = lctx.model.hparams;
9999
+
10000
+ const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
10001
+
10002
+ const auto n_batch = cparams.n_batch;
10003
+ const auto n_vocab = hparams.n_vocab;
10004
+ const auto n_embd = hparams.n_embd;
10005
+
10006
+ // TODO: use a per-batch flag for logits presence instead
10007
+ const bool has_logits = cparams.causal_attn;
10008
+ const bool has_embd = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
10009
+
10010
+ const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
10011
+ const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
10012
+
10013
+ if (lctx.output_ids.empty()) {
10014
+ // init, never resized afterwards
10015
+ lctx.output_ids.resize(n_batch);
10016
+ }
10017
+
10018
+ const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0;
10019
+ const size_t new_size = (logits_size + embd_size) * sizeof(float);
10020
+
10021
+ // alloc only when more than the current capacity is required
10022
+ // TODO: also consider shrinking the buffer
10023
+ if (!lctx.buf_output || prev_size < new_size) {
10024
+ if (lctx.buf_output) {
10025
+ #ifndef NDEBUG
10026
+ // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
10027
+ LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
10028
+ #endif
10029
+ ggml_backend_buffer_free(lctx.buf_output);
10030
+ lctx.buf_output = nullptr;
10031
+ lctx.logits = nullptr;
10032
+ lctx.embd = nullptr;
10033
+ }
10034
+
10035
+ lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
10036
+ if (lctx.buf_output == nullptr) {
10037
+ LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
10038
+ return 0;
10039
+ }
10040
+ }
10041
+
10042
+ float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output);
10043
+
10044
+ lctx.logits = has_logits ? output_base : nullptr;
10045
+ lctx.embd = has_embd ? output_base + logits_size : nullptr;
10046
+
10047
+ lctx.output_size = n_outputs_max;
10048
+ lctx.logits_size = logits_size;
10049
+ lctx.embd_size = embd_size;
10050
+
10051
+ // set all ids as invalid (negative)
10052
+ std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);
10053
+
10054
+ ggml_backend_buffer_clear(lctx.buf_output, 0);
10055
+
10056
+ lctx.n_outputs = 0;
10057
+
10058
+ return n_outputs_max;
10059
+ }
10060
+
10061
+
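llama_output_reserve sizes one host buffer for at most n_outputs_max rows of logits and/or embeddings, rather than one row per batch token. A small worked example of that sizing, with an assumed vocabulary size of 32000 and 512 reserved outputs:

#include <cstddef>
#include <cstdio>

// Worked example of the sizing in llama_output_reserve above: a logits-only
// context with an assumed n_vocab of 32000, reserving space for 512 output rows.
int main() {
    const size_t n_outputs_max = 512;
    const size_t n_vocab       = 32000;                        // assumption for the example
    const size_t logits_size   = n_vocab * n_outputs_max;      // number of floats
    const size_t new_size      = logits_size * sizeof(float);  // bytes
    std::printf("output buffer: %.2f MiB\n", new_size / (1024.0 * 1024.0)); // prints 62.50 MiB
    return 0;
}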
8974
10062
  static void llama_graph_compute(
8975
10063
  llama_context & lctx,
8976
10064
  ggml_cgraph * gf,
@@ -9046,16 +10134,8 @@ static int llama_decode_internal(
9046
10134
  const int64_t n_embd = hparams.n_embd;
9047
10135
  const int64_t n_vocab = hparams.n_vocab;
9048
10136
 
9049
-
9050
- auto * logits_out = lctx.logits;
9051
-
9052
- #ifndef NDEBUG
9053
- auto & logits_valid = lctx.logits_valid;
9054
- logits_valid.clear();
9055
- logits_valid.resize(n_tokens_all);
9056
-
9057
- memset(logits_out, 0, lctx.logits_size*sizeof(float));
9058
- #endif
10137
+ uint32_t n_outputs = 0;
10138
+ uint32_t n_outputs_prev = 0;
9059
10139
 
9060
10140
  const auto n_ubatch = cparams.n_ubatch;
9061
10141
 
@@ -9064,6 +10144,38 @@ static int llama_decode_internal(
9064
10144
  std::vector<llama_seq_id *> seq_id_arr;
9065
10145
  std::vector<std::vector<llama_seq_id>> seq_id;
9066
10146
 
10147
+ // count outputs
10148
+ if (batch_all.logits) {
10149
+ for (uint32_t i = 0; i < n_tokens_all; ++i) {
10150
+ n_outputs += batch_all.logits[i] != 0;
10151
+ }
10152
+ } else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
10153
+ n_outputs = n_tokens_all;
10154
+ } else {
10155
+ // keep last output only
10156
+ n_outputs = 1;
10157
+ }
10158
+
10159
+ // reserve output buffer
10160
+ if (llama_output_reserve(lctx, n_outputs) < n_outputs) {
10161
+ LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs);
10162
+ return -2;
10163
+ };
10164
+
10165
+ // set output mappings
10166
+ if (batch_all.logits) {
10167
+ int32_t i_logits = 0;
10168
+ for (uint32_t i = 0; i < n_tokens_all; ++i) {
10169
+ if (batch_all.logits[i]) {
10170
+ lctx.output_ids[i] = i_logits++;
10171
+ }
10172
+ }
10173
+ } else {
10174
+ for (uint32_t i = 0; i < n_outputs; ++i) {
10175
+ lctx.output_ids[i] = i;
10176
+ }
10177
+ }
10178
+
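The counting above means a caller controls how many output rows exist through batch.logits: only tokens flagged there get a row in the output buffer. A caller-side sketch that requests logits for just the last prompt token (the helper name is illustrative and error handling is omitted):

#include <cstdint>
#include <vector>

#include "llama.h"

// Flag only the last prompt token in batch.logits so the counting above
// yields n_outputs == 1 and a single row of logits is copied back.
static void decode_prompt_last_logits_only(llama_context * ctx, const std::vector<llama_token> & prompt) {
    const int32_t n = (int32_t) prompt.size();

    llama_batch batch = llama_batch_init(n, 0, 1);
    batch.n_tokens = n;
    for (int32_t i = 0; i < n; ++i) {
        batch.token   [i] = prompt[i];
        batch.pos     [i] = i;
        batch.n_seq_id[i] = 1;
        batch.seq_id  [i][0] = 0;
        batch.logits  [i] = (i == n - 1); // only the final token produces an output row
    }

    llama_decode(ctx, batch);                                     // copies back a single row of logits
    const float * last_logits = llama_get_logits_ith(ctx, n - 1); // row for the flagged token
    (void) last_logits;

    llama_batch_free(batch);
}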
9067
10179
  for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
9068
10180
  const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
9069
10181
  llama_batch u_batch = {
@@ -9079,6 +10191,27 @@ static int llama_decode_internal(
9079
10191
  /* .all_seq_id = */ batch_all.all_seq_id,
9080
10192
  };
9081
10193
 
10194
+ // count the outputs in this u_batch
10195
+ {
10196
+ int32_t n_outputs_new = 0;
10197
+
10198
+ if (u_batch.logits) {
10199
+ for (uint32_t i = 0; i < n_tokens; i++) {
10200
+ n_outputs_new += u_batch.logits[i] != 0;
10201
+ }
10202
+ } else if (n_outputs == n_tokens_all) {
10203
+ n_outputs_new = n_tokens;
10204
+ } else {
10205
+ // keep last output only
10206
+ if (cur_token + n_tokens >= n_tokens_all) {
10207
+ n_outputs_new = 1;
10208
+ }
10209
+ }
10210
+
10211
+ // needs to happen before the graph is built
10212
+ lctx.n_outputs = n_outputs_new;
10213
+ }
10214
+
9082
10215
  int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
9083
10216
  GGML_ASSERT(n_threads > 0);
9084
10217
 
@@ -9142,23 +10275,37 @@ static int llama_decode_internal(
9142
10275
  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
9143
10276
  struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
9144
10277
 
9145
- if (!hparams.causal_attn) {
10278
+ if (lctx.n_outputs == 0) {
10279
+ // no output
10280
+ res = nullptr;
10281
+ embd = nullptr;
10282
+ } else if (!hparams.causal_attn) {
9146
10283
  res = nullptr; // do not extract logits for embedding models such as BERT
9147
10284
 
9148
10285
  // token or sequence embeddings
9149
10286
  embd = gf->nodes[gf->n_nodes - 1];
9150
10287
 
9151
10288
  GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
9152
- } else {
9153
- if (strcmp(res->name, "result_output") == 0) {
9154
- // the token embeddings could be the second to last tensor, or the third to last tensor
9155
- if (strcmp(embd->name, "result_norm") != 0) {
9156
- embd = gf->nodes[gf->n_nodes - 3];
9157
- GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
9158
- }
9159
- } else {
9160
- GGML_ASSERT(false && "missing result_output tensor");
10289
+ } else if (cparams.embeddings) {
10290
+ // the embeddings could be in the second to last tensor, or any of the previous tensors
10291
+ int i_embd = gf->n_nodes - 2;
10292
+ for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
10293
+ i_embd = gf->n_nodes - i;
10294
+ if (i_embd < 0) { break; }
10295
+ embd = gf->nodes[i_embd];
10296
+ }
10297
+ GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
10298
+
10299
+ // TODO: use a per-batch flag to know when to skip logits while keeping embeddings
10300
+ if (!cparams.causal_attn) {
10301
+ res = nullptr; // do not extract logits when not needed
10302
+ // skip computing logits
10303
+ // TODO: is this safe?
10304
+ gf->n_nodes = i_embd + 1;
9161
10305
  }
10306
+ } else {
10307
+ embd = nullptr; // do not extract embeddings when not needed
10308
+ GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
9162
10309
  }
9163
10310
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
9164
10311
 
@@ -9201,50 +10348,23 @@ static int llama_decode_internal(
9201
10348
  //}
9202
10349
 
9203
10350
  // extract logits
9204
- // TODO: do not compute and extract logits if only embeddings are needed
9205
- // update the graphs to skip "result_output" if logits are not needed
9206
10351
  if (res) {
9207
10352
  ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
9208
10353
  GGML_ASSERT(backend_res != nullptr);
9209
- if (u_batch.logits) {
9210
- int32_t i_first = -1;
9211
- for (uint32_t i = 0; i < n_tokens; i++) {
9212
- if (u_batch.logits[i] && i_first == -1) {
9213
- i_first = (int32_t) i;
9214
- }
9215
- if (u_batch.logits[i] == 0 || i == n_tokens - 1) {
9216
- if (i_first != -1) {
9217
- int i_last = u_batch.logits[i] == 0 ? i : i + 1;
9218
- // extract logits for the range [i_first, i_last)
9219
- // group the requests to minimize the number of calls to the backend
9220
- ggml_backend_tensor_get_async(backend_res, res,
9221
- logits_out + n_vocab*(cur_token + i_first),
9222
- i_first*n_vocab*sizeof(float),
9223
- (i_last - i_first)*n_vocab*sizeof(float));
9224
- i_first = -1;
9225
- }
9226
- }
9227
- #ifndef NDEBUG
9228
- logits_valid[cur_token + i] = u_batch.logits[i] != 0;;
9229
- #endif
9230
- }
9231
- } else if (lctx.logits_all) {
9232
- ggml_backend_tensor_get_async(backend_res, res, logits_out + n_vocab*cur_token, 0, n_vocab*n_tokens*sizeof(float));
9233
- #ifndef NDEBUG
9234
- std::fill(logits_valid.begin() + cur_token, logits_valid.begin() + cur_token + n_tokens, true);
9235
- #endif
9236
- } else {
9237
- if (cur_token + n_tokens >= n_tokens_all) {
9238
- ggml_backend_tensor_get_async(backend_res, res, logits_out, n_vocab*(n_tokens - 1)*sizeof(float), n_vocab*sizeof(float));
9239
- #ifndef NDEBUG
9240
- logits_valid[0] = true;
9241
- #endif
9242
- }
10354
+ GGML_ASSERT(lctx.logits != nullptr);
10355
+
10356
+ float * logits_out = lctx.logits + n_outputs_prev*n_vocab;
10357
+ const int32_t n_outputs_new = lctx.n_outputs;
10358
+
10359
+ if (n_outputs_new) {
10360
+ GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
10361
+ GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size);
10362
+ ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float));
9243
10363
  }
9244
10364
  }
9245
10365
 
9246
10366
  // extract embeddings
9247
- if (cparams.embeddings && embd) {
10367
+ if (embd) {
9248
10368
  ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
9249
10369
  GGML_ASSERT(backend_embd != nullptr);
9250
10370
 
@@ -9252,16 +10372,14 @@ static int llama_decode_internal(
9252
10372
  case LLAMA_POOLING_TYPE_NONE:
9253
10373
  {
9254
10374
  // extract token embeddings
9255
- auto & embd_out = lctx.embd;
9256
-
9257
- if (u_batch.logits) {
9258
- //embd_out.resize(n_embd * n_tokens);
9259
- for (uint32_t i = 0; i < n_tokens; i++) {
9260
- if (u_batch.logits[i] == 0) {
9261
- continue;
9262
- }
9263
- ggml_backend_tensor_get_async(backend_embd, embd, embd_out + n_embd*(i + cur_token), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
9264
- }
10375
+ GGML_ASSERT(lctx.embd != nullptr);
10376
+ float * embd_out = lctx.embd + n_outputs_prev*n_embd;
10377
+ const int32_t n_outputs_new = lctx.n_outputs;
10378
+
10379
+ if (n_outputs_new) {
10380
+ GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
10381
+ GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size);
10382
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
9265
10383
  }
9266
10384
  } break;
9267
10385
  case LLAMA_POOLING_TYPE_CLS:
@@ -9288,6 +10406,7 @@ static int llama_decode_internal(
9288
10406
  } break;
9289
10407
  }
9290
10408
  }
10409
+ n_outputs_prev += lctx.n_outputs;
9291
10410
  }
9292
10411
 
9293
10412
  // wait for the computation to finish (automatically done when obtaining the model output)
@@ -10218,7 +11337,7 @@ struct llm_tokenizer_wpm {
10218
11337
  if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
10219
11338
  continue;
10220
11339
  }
10221
- code = to_lower(code);
11340
+ code = unicode_tolower(code);
10222
11341
  if (type == CODEPOINT_TYPE_WHITESPACE) {
10223
11342
  code = ' ';
10224
11343
  }
@@ -10238,7 +11357,7 @@ struct llm_tokenizer_wpm {
10238
11357
  std::vector<std::string> words;
10239
11358
  while (r < new_str.size()) {
10240
11359
  // if is whitespace
10241
- if (isspace(new_str[r])) {
11360
+ if (isspace(new_str[r], std::locale::classic())) {
10242
11361
  if (r > l) words.push_back(new_str.substr(l, (r - l)));
10243
11362
  l = r + 1;
10244
11363
  r = l;
@@ -10252,18 +11371,12 @@ struct llm_tokenizer_wpm {
10252
11371
  return words;
10253
11372
  }
10254
11373
 
10255
- uint32_t to_lower(uint32_t code) {
10256
- static const std::locale locale("en_US.UTF-8");
10257
- #if defined(_WIN32)
10258
- if (code > 0xFFFF) {
10259
- return code;
10260
- }
10261
- #endif
10262
- return std::tolower(wchar_t(code), locale);
10263
- }
10264
-
10265
11374
  bool is_ascii_punct(uint32_t code) {
10266
- return code < 256 && ispunct(code);
11375
+ if (code > 0xFF) {
11376
+ return false;
11377
+ }
11378
+ auto c = char(static_cast<unsigned char>(code));
11379
+ return ispunct(c, std::locale::classic());
10267
11380
  }
10268
11381
 
10269
11382
  bool is_chinese_char(uint32_t cpt) {
@@ -10508,28 +11621,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
10508
11621
  // grammar - internal
10509
11622
  //
10510
11623
 
10511
- struct llama_partial_utf8 {
10512
- uint32_t value; // bit value so far (unshifted)
10513
- int n_remain; // num bytes remaining; -1 indicates invalid sequence
10514
- };
10515
-
10516
- struct llama_grammar {
10517
- const std::vector<std::vector<llama_grammar_element>> rules;
10518
- std::vector<std::vector<const llama_grammar_element *>> stacks;
10519
-
10520
- // buffer for partially generated UTF-8 sequence from accepted tokens
10521
- llama_partial_utf8 partial_utf8;
10522
- };
10523
-
10524
- struct llama_grammar_candidate {
10525
- size_t index;
10526
- const uint32_t * code_points;
10527
- llama_partial_utf8 partial_utf8;
10528
- };
10529
11624
 
10530
11625
  // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
10531
11626
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
10532
- static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
11627
+ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
10533
11628
  const std::string & src,
10534
11629
  llama_partial_utf8 partial_start) {
10535
11630
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
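decode_utf8 loses its static linkage here (the grammar types above moved out of this file); per the comment it decodes a possibly truncated UTF-8 string into code points terminated by 0 and reports any unfinished trailing sequence through llama_partial_utf8. A rough sketch of that contract as a caller would see it, assuming the declarations are now visible from an internal header (expected values are inferred from the comment, not verified output):

#include <string>

// Rough sketch of the decode_utf8 contract described in the comment above.
static void decode_utf8_example() {
    llama_partial_utf8 state = { 0, 0 };                         // no pending bytes from a previous chunk
    auto result = decode_utf8(std::string("h\xC3\xA9!"), state); // "hé!"
    // result.first  is expected to be { 'h', 0x00E9, '!', 0 }   (terminating 0 appended)
    // result.second.n_remain is expected to be 0                (no incomplete trailing sequence)
    (void) result;
}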
@@ -10731,7 +11826,7 @@ static void llama_grammar_advance_stack(
10731
11826
  // be positioned at a character range (see `llama_grammar_advance_stack`), and
10732
11827
  // produces the N possible stacks if the given char is accepted at those
10733
11828
  // positions
10734
- static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
11829
+ std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
10735
11830
  const std::vector<std::vector<llama_grammar_element>> & rules,
10736
11831
  const std::vector<std::vector<const llama_grammar_element *>> & stacks,
10737
11832
  const uint32_t chr) {
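Dropping the static qualifier on decode_utf8 and llama_grammar_accept (and removing the local struct definitions) exposes them to other translation units, e.g. for grammar tests. A rough usage sketch for decode_utf8, assuming only the signature shown above: the llama_partial_utf8 state returned for one chunk is passed as partial_start for the next, so a multi-byte sequence split across chunks still decodes.

    llama_partial_utf8 state = { 0, 0 };

    // '€' (U+20AC) is E2 82 AC in UTF-8; feed it in two pieces
    auto first = decode_utf8(std::string("\xE2\x82", 2), state);
    state = first.second;                      // n_remain == 1: one continuation byte still missing

    auto second = decode_utf8(std::string("\xAC", 1), state);
    // second.first == { 0x20AC, 0 }           // the trailing 0 is the terminator decode_utf8 appends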
@@ -11957,7 +13052,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
11957
13052
  // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
11958
13053
  // for getting the current layer as I initially thought, and we need to resort to parsing the
11959
13054
  // tensor name.
11960
- n_layer /= n_expert;
11961
13055
  if (sscanf(name, "blk.%d.", &i_layer) != 1) {
11962
13056
  throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
11963
13057
  }
@@ -11971,30 +13065,39 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
11971
13065
  // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
11972
13066
  // with the quantization of the output tensor
11973
13067
  if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
11974
- int nx = tensor->ne[0];
11975
- if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
11976
- new_type = GGML_TYPE_Q8_0;
11977
- }
11978
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
11979
- ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
11980
- new_type = GGML_TYPE_Q5_K;
11981
- }
11982
- else if (new_type != GGML_TYPE_Q8_0) {
11983
- new_type = GGML_TYPE_Q6_K;
13068
+ if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
13069
+ new_type = qs.params->output_tensor_type;
13070
+ } else {
13071
+ int nx = tensor->ne[0];
13072
+ if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
13073
+ new_type = GGML_TYPE_Q8_0;
13074
+ }
13075
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
13076
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
13077
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
13078
+ new_type = GGML_TYPE_Q5_K;
13079
+ }
13080
+ else if (new_type != GGML_TYPE_Q8_0) {
13081
+ new_type = GGML_TYPE_Q6_K;
13082
+ }
11984
13083
  }
11985
13084
  } else if (name == "token_embd.weight") {
11986
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
11987
- ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
11988
- new_type = GGML_TYPE_Q2_K;
11989
- }
11990
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
11991
- new_type = GGML_TYPE_IQ3_S;
11992
- }
11993
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
11994
- new_type = GGML_TYPE_IQ3_S;
13085
+ if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
13086
+ new_type = qs.params->token_embedding_type;
13087
+ } else {
13088
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
13089
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
13090
+ new_type = GGML_TYPE_Q2_K;
13091
+ }
13092
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
13093
+ new_type = GGML_TYPE_IQ3_S;
13094
+ }
13095
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
13096
+ new_type = GGML_TYPE_IQ3_S;
13097
+ }
11995
13098
  }
11996
13099
  } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
11997
- ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
13100
+ ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
11998
13101
  if (name.find("attn_v.weight") != std::string::npos) {
11999
13102
  if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
12000
13103
  else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
@@ -12013,7 +13116,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
12013
13116
  if (qs.model.hparams.n_expert == 8) {
12014
13117
  new_type = GGML_TYPE_Q5_K;
12015
13118
  } else {
12016
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
13119
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
12017
13120
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
12018
13121
  }
12019
13122
  }
@@ -12027,13 +13130,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
12027
13130
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
12028
13131
  new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
12029
13132
  }
12030
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
12031
- new_type = GGML_TYPE_Q4_K;
12032
- }
12033
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
12034
- new_type = GGML_TYPE_Q4_K;
12035
- }
12036
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
13133
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
12037
13134
  new_type = GGML_TYPE_Q4_K;
12038
13135
  }
12039
13136
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
@@ -12186,7 +13283,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
12186
13283
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
12187
13284
  new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
12188
13285
  new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
12189
- new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
13286
+ new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
13287
+ new_type == GGML_TYPE_IQ1_M) {
12190
13288
  int nx = tensor->ne[0];
12191
13289
  int ny = tensor->ne[1];
12192
13290
  if (nx % QK_K != 0) {
@@ -12204,6 +13302,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
12204
13302
  case GGML_TYPE_IQ3_XXS:
12205
13303
  case GGML_TYPE_IQ3_S:
12206
13304
  case GGML_TYPE_IQ1_S:
13305
+ case GGML_TYPE_IQ1_M:
12207
13306
  case GGML_TYPE_Q2_K:
12208
13307
  case GGML_TYPE_Q3_K:
12209
13308
  case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
@@ -12285,6 +13384,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12285
13384
  case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
12286
13385
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
12287
13386
  case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
13387
+ case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
12288
13388
  case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
12289
13389
  case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
12290
13390
  case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
@@ -12307,8 +13407,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12307
13407
  constexpr bool use_mmap = false;
12308
13408
  #endif
12309
13409
 
12310
- llama_model_loader ml(fname_inp, use_mmap, NULL);
12311
- ml.init_mapping(false); // no prefetching?
13410
+ llama_model_kv_override * kv_overrides = nullptr;
13411
+ if (params->kv_overrides) {
13412
+ auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
13413
+ kv_overrides = v->data();
13414
+ }
13415
+ llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
13416
+ ml.init_mappings(false); // no prefetching
12312
13417
 
12313
13418
  llama_model model;
12314
13419
  llm_load_arch(ml, model);
@@ -12332,36 +13437,43 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12332
13437
  struct gguf_context * ctx_out = gguf_init_empty();
12333
13438
 
12334
13439
  // copy the KV pairs from the input file
12335
- gguf_set_kv (ctx_out, ml.ctx_gguf);
13440
+ gguf_set_kv (ctx_out, ml.meta);
12336
13441
  gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
12337
13442
  gguf_set_val_u32(ctx_out, "general.file_type", ftype);
12338
13443
 
13444
+ if (params->kv_overrides) {
13445
+ const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
13446
+ for (auto & o : overrides) {
13447
+ if (o.key[0] == 0) break;
13448
+ if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
13449
+ gguf_set_val_f32(ctx_out, o.key, o.float_value);
13450
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
13451
+ gguf_set_val_i32(ctx_out, o.key, o.int_value);
13452
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
13453
+ gguf_set_val_bool(ctx_out, o.key, o.bool_value);
13454
+ } else {
13455
+ LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
13456
+ }
13457
+ }
13458
+ }
13459
+
12339
13460
  for (int i = 0; i < ml.n_tensors; ++i) {
12340
- struct ggml_tensor * meta = ml.get_tensor_meta(i);
13461
+ const struct ggml_tensor * meta = ml.get_tensor_meta(i);
12341
13462
 
12342
13463
  const std::string name = ggml_get_name(meta);
12343
13464
 
12344
13465
  // TODO: avoid hardcoded tensor names - use the TN_* constants
12345
13466
  if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
12346
13467
  ++qs.n_attention_wv;
12347
- }
12348
- else if (name.find("ffn_down") != std::string::npos) {
12349
- ++qs.n_ffn_down;
12350
- }
12351
- else if (name.find("ffn_gate") != std::string::npos) {
12352
- ++qs.n_ffn_gate;
12353
- }
12354
- else if (name.find("ffn_up") != std::string::npos) {
12355
- ++qs.n_ffn_up;
12356
- }
12357
- else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
13468
+ } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
12358
13469
  qs.has_output = true;
12359
13470
  }
12360
13471
  }
12361
- if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
12362
- LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
12363
- __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
12364
- }
13472
+
13473
+ qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
13474
+
13475
+ // sanity checks
13476
+ GGML_ASSERT(qs.n_attention_wv == (int)model.hparams.n_layer && "n_attention_wv != n_layer is unexpected");
12365
13477
 
12366
13478
  size_t total_size_org = 0;
12367
13479
  size_t total_size_new = 0;
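The kv_overrides handling added above is driven entirely through the opaque params->kv_overrides pointer: it is cast back to a std::vector<llama_model_kv_override> and iterated until an entry with an empty key. A sketch of what a caller might build (the override key here is only an example; the field and tag names are the ones used in the loop above):

    std::vector<llama_model_kv_override> kv_overrides;

    llama_model_kv_override o = {};
    std::snprintf(o.key, sizeof(o.key), "%s", "tokenizer.ggml.add_bos_token");
    o.tag        = LLAMA_KV_OVERRIDE_TYPE_BOOL;
    o.bool_value = true;
    kv_overrides.push_back(o);

    kv_overrides.emplace_back();               // zero-initialized entry: key[0] == 0 terminates the list

    llama_model_quantize_params qp = llama_model_quantize_default_params();
    qp.kv_overrides = &kv_overrides;           // consumed as an opaque pointer and cast back as above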
@@ -12377,7 +13489,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12377
13489
 
12378
13490
  // populate the original tensors so we get an initial meta data
12379
13491
  for (int i = 0; i < ml.n_tensors; ++i) {
12380
- struct ggml_tensor * meta = ml.get_tensor_meta(i);
13492
+ const struct ggml_tensor * meta = ml.get_tensor_meta(i);
12381
13493
  gguf_add_tensor(ctx_out, meta);
12382
13494
  }
12383
13495
 
@@ -12391,6 +13503,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12391
13503
  // placeholder for the meta data
12392
13504
  ::zeros(fout, meta_size);
12393
13505
 
13506
+ const auto tn = LLM_TN(model.arch);
13507
+
12394
13508
  for (int i = 0; i < ml.n_tensors; ++i) {
12395
13509
  struct ggml_tensor * tensor = ml.get_tensor_meta(i);
12396
13510
 
@@ -12413,8 +13527,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12413
13527
  // This used to be a regex, but <regex> has an extreme cost to compile times.
12414
13528
  bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
12415
13529
 
12416
- // quantize only 2D tensors
12417
- quantize &= (ggml_n_dims(tensor) == 2);
13530
+ // quantize only 2D and 3D tensors (experts)
13531
+ quantize &= (ggml_n_dims(tensor) >= 2);
12418
13532
  quantize &= params->quantize_output_tensor || name != "output.weight";
12419
13533
  quantize &= !params->only_copy;
12420
13534
 
@@ -12443,6 +13557,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12443
13557
  if (!params->pure && ggml_is_quantized(default_type)) {
12444
13558
  new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
12445
13559
  }
13560
+ else if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
13561
+ new_type = params->token_embedding_type;
13562
+ }
13563
+ else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
13564
+ new_type = params->output_tensor_type;
13565
+ }
12446
13566
 
12447
13567
  // If we've decided to quantize to the same type the tensor is already
12448
13568
  // in then there's nothing to do.
@@ -12463,11 +13583,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12463
13583
  if (it == imatrix_data->end()) {
12464
13584
  LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
12465
13585
  } else {
12466
- if (it->second.size() == (size_t)tensor->ne[0]) {
13586
+ if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
12467
13587
  imatrix = it->second.data();
12468
13588
  } else {
12469
13589
  LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
12470
- int(it->second.size()), int(tensor->ne[0]), tensor->name);
13590
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
13591
+
13592
+ // this can happen when quantizing an old mixtral model with split expert tensors using a new, incompatible imatrix
13593
+ // this is a significant error and it may be a good idea to abort the process if this happens,
13594
+ // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
13595
+ // tok_embd should be ignored in this case, since it always causes this warning
13596
+ if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
13597
+ throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
13598
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
13599
+ }
12471
13600
  }
12472
13601
  }
12473
13602
  }
@@ -12475,6 +13604,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12475
13604
  new_type == GGML_TYPE_IQ2_XS ||
12476
13605
  new_type == GGML_TYPE_IQ2_S ||
12477
13606
  new_type == GGML_TYPE_IQ1_S ||
13607
+ (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
12478
13608
  (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
12479
13609
  LLAMA_LOG_ERROR("\n\n============================================================\n");
12480
13610
  LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
@@ -12503,15 +13633,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12503
13633
  new_data = work.data();
12504
13634
 
12505
13635
  const int n_per_row = tensor->ne[0];
12506
- const int nrows = nelements / n_per_row;
13636
+ const int nrows = tensor->ne[1];
12507
13637
 
12508
13638
  static const int min_chunk_size = 32 * 512;
12509
13639
  const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
12510
13640
 
12511
- const int nchunk = (nelements + chunk_size - 1)/chunk_size;
13641
+ const int nelements_matrix = tensor->ne[0] * tensor->ne[1];
13642
+ const int nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
12512
13643
  const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
12513
- new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix, workers, nthread_use);
12514
13644
 
13645
+ // quantize each expert separately since they have different importance matrices
13646
+ new_size = 0;
13647
+ for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
13648
+ const float * f32_data_03 = f32_data + i03 * nelements_matrix;
13649
+ void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
13650
+ const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
13651
+
13652
+ new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
13653
+ }
12515
13654
  LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
12516
13655
  }
12517
13656
  total_size_org += ggml_nbytes(tensor);
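To put numbers on the per-expert split above (hypothetical Mixtral-like shapes, not taken from the diff): a merged ffn_down_exps tensor with ne = {14336, 4096, 8} gives n_per_row = 14336, nrows = 4096 and eight expert slices, so its imatrix entry must hold ne[0]*ne[2] = 114688 values (one row of 14336 per expert). Expert i03 then reads its floats at offset i03*14336*4096 into f32_data and writes its quantized rows starting at i03*4096*ggml_row_size(new_type, 14336) bytes into new_data, which is exactly what the loop above computes.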
@@ -12582,7 +13721,7 @@ static int llama_apply_lora_from_file_internal(
12582
13721
  if (path_base_model) {
12583
13722
  LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
12584
13723
  ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
12585
- ml->init_mapping(/*prefetch*/ false); // no prefetching
13724
+ ml->init_mappings(/*prefetch*/ false); // no prefetching
12586
13725
  }
12587
13726
 
12588
13727
  struct tensor_meta {
@@ -12703,7 +13842,7 @@ static int llama_apply_lora_from_file_internal(
12703
13842
 
12704
13843
  ggml_tensor * base_t;
12705
13844
  if (ml) {
12706
- if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) {
13845
+ if (!ml->get_tensor_meta(base_name.c_str())) {
12707
13846
  LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
12708
13847
  return 1;
12709
13848
  }
@@ -12887,11 +14026,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
12887
14026
  struct llama_model_quantize_params result = {
12888
14027
  /*.nthread =*/ 0,
12889
14028
  /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
14029
+ /*.output_tensor_type =*/ GGML_TYPE_COUNT,
14030
+ /*.token_embedding_type =*/ GGML_TYPE_COUNT,
12890
14031
  /*.allow_requantize =*/ false,
12891
14032
  /*.quantize_output_tensor =*/ true,
12892
14033
  /*.only_copy =*/ false,
12893
14034
  /*.pure =*/ false,
12894
14035
  /*.imatrix =*/ nullptr,
14036
+ /*.kv_overrides =*/ nullptr,
12895
14037
  };
12896
14038
 
12897
14039
  return result;
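A minimal sketch of exercising the two new per-tensor type fields from the public API (this assumes the usual llama_model_quantize() entry point; the file names are placeholders):

    llama_model_quantize_params qp = llama_model_quantize_default_params();
    qp.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M;
    qp.output_tensor_type   = GGML_TYPE_Q6_K;      // force output.weight to Q6_K
    qp.token_embedding_type = GGML_TYPE_Q8_0;      // force token_embd.weight to Q8_0
    // leaving either field at GGML_TYPE_COUNT keeps the automatic per-tensor choice

    const uint32_t rc = llama_model_quantize("model-f16.gguf", "model-Q4_K_M.gguf", &qp);
    // rc == 0 on success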
@@ -12900,7 +14042,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
12900
14042
  size_t llama_max_devices(void) {
12901
14043
  #if defined(GGML_USE_METAL)
12902
14044
  return 1;
12903
- #elif defined(GGML_USE_CUBLAS)
14045
+ #elif defined(GGML_USE_CUDA)
12904
14046
  return GGML_CUDA_MAX_DEVICES;
12905
14047
  #elif defined(GGML_USE_SYCL)
12906
14048
  return GGML_SYCL_MAX_DEVICES;
@@ -12920,8 +14062,8 @@ bool llama_supports_mlock(void) {
12920
14062
  }
12921
14063
 
12922
14064
  bool llama_supports_gpu_offload(void) {
12923
- #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
12924
- defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
14065
+ #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
14066
+ defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
12925
14067
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
12926
14068
  return true;
12927
14069
  #else
@@ -13028,7 +14170,7 @@ struct llama_context * llama_new_context_with_model(
13028
14170
  const auto & hparams = model->hparams;
13029
14171
  auto & cparams = ctx->cparams;
13030
14172
 
13031
- // TODO: maybe add n_seq_max here too
14173
+ cparams.n_seq_max = std::max(1u, params.n_seq_max);
13032
14174
  cparams.n_threads = params.n_threads;
13033
14175
  cparams.n_threads_batch = params.n_threads_batch;
13034
14176
  cparams.yarn_ext_factor = params.yarn_ext_factor;
@@ -13126,7 +14268,7 @@ struct llama_context * llama_new_context_with_model(
13126
14268
  }
13127
14269
  ctx->backends.push_back(ctx->backend_metal);
13128
14270
  }
13129
- #elif defined(GGML_USE_CUBLAS)
14271
+ #elif defined(GGML_USE_CUDA)
13130
14272
  if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
13131
14273
  // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
13132
14274
  ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
@@ -13149,7 +14291,20 @@ struct llama_context * llama_new_context_with_model(
13149
14291
  }
13150
14292
  }
13151
14293
  #elif defined(GGML_USE_VULKAN)
13152
- if (model->n_gpu_layers > 0) {
14294
+ if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
14295
+ LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
14296
+ llama_free(ctx);
14297
+ return nullptr;
14298
+ }
14299
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
14300
+ ggml_backend_t backend = ggml_backend_vk_init(0);
14301
+ if (backend == nullptr) {
14302
+ LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
14303
+ llama_free(ctx);
14304
+ return nullptr;
14305
+ }
14306
+ ctx->backends.push_back(backend);
14307
+ } else {
13153
14308
  for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
13154
14309
  ggml_backend_t backend = ggml_backend_vk_init(device);
13155
14310
  if (backend == nullptr) {
@@ -13161,30 +14316,28 @@ struct llama_context * llama_new_context_with_model(
13161
14316
  }
13162
14317
  }
13163
14318
  #elif defined(GGML_USE_SYCL)
13164
- if (model->n_gpu_layers > 0) {
13165
- // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
13166
- if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
13167
- ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
14319
+ // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
14320
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
14321
+ ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
14322
+ if (backend == nullptr) {
14323
+ int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
14324
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
14325
+ llama_free(ctx);
14326
+ return nullptr;
14327
+ }
14328
+ ctx->backends.push_back(backend);
14329
+ } else {
14330
+ // LLAMA_SPLIT_LAYER requires a backend for each GPU
14331
+ for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
14332
+ ggml_backend_t backend = ggml_backend_sycl_init(i);
13168
14333
  if (backend == nullptr) {
13169
- int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
13170
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
14334
+ int id_list[GGML_SYCL_MAX_DEVICES];
14335
+ ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
14336
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
13171
14337
  llama_free(ctx);
13172
14338
  return nullptr;
13173
14339
  }
13174
14340
  ctx->backends.push_back(backend);
13175
- } else {
13176
- // LLAMA_SPLIT_LAYER requires a backend for each GPU
13177
- for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
13178
- ggml_backend_t backend = ggml_backend_sycl_init(i);
13179
- if (backend == nullptr) {
13180
- int id_list[GGML_SYCL_MAX_DEVICES];
13181
- ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
13182
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
13183
- llama_free(ctx);
13184
- return nullptr;
13185
- }
13186
- ctx->backends.push_back(backend);
13187
- }
13188
14341
  }
13189
14342
  }
13190
14343
  #elif defined(GGML_USE_KOMPUTE)
@@ -13232,25 +14385,12 @@ struct llama_context * llama_new_context_with_model(
13232
14385
 
13233
14386
  // graph outputs buffer
13234
14387
  {
13235
- // resized during inference, reserve maximum
13236
- ctx->logits_size = hparams.n_vocab*cparams.n_batch;
13237
- ctx->embd_size = params.embeddings ? hparams.n_embd*cparams.n_batch : 0;
13238
-
13239
- const size_t buf_output_size = (ctx->logits_size + ctx->embd_size)*sizeof(float);
13240
-
13241
- ctx->buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_output_size);
13242
- if (ctx->buf_output == nullptr) {
13243
- LLAMA_LOG_ERROR("%s: failed to allocate logits buffer\n", __func__);
14388
+ // resized during inference when a batch uses more outputs
14389
+ if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
14390
+ LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
13244
14391
  llama_free(ctx);
13245
14392
  return nullptr;
13246
14393
  }
13247
- ggml_backend_buffer_clear(ctx->buf_output, 0);
13248
-
13249
-
13250
- ctx->logits = (float *) ggml_backend_buffer_get_base(ctx->buf_output);
13251
- if (params.embeddings) {
13252
- ctx->embd = ctx->logits + ctx->logits_size;
13253
- }
13254
14394
 
13255
14395
  LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
13256
14396
  ggml_backend_buffer_name(ctx->buf_output),
@@ -13275,7 +14415,7 @@ struct llama_context * llama_new_context_with_model(
13275
14415
 
13276
14416
  // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
13277
14417
  bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
13278
- #ifndef GGML_USE_CUBLAS
14418
+ #ifndef GGML_USE_CUDA
13279
14419
  // pipeline parallelism requires support for async compute and events
13280
14420
  // currently this is only implemented in the CUDA backend
13281
14421
  pipeline_parallel = false;
@@ -13383,11 +14523,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
13383
14523
  case LLM_ARCH_ORION:
13384
14524
  case LLM_ARCH_INTERNLM2:
13385
14525
  case LLM_ARCH_MINICPM:
14526
+ case LLM_ARCH_XVERSE:
13386
14527
  case LLM_ARCH_COMMAND_R:
13387
14528
  return LLAMA_ROPE_TYPE_NORM;
13388
14529
 
13389
14530
  // the pairs of head values are offset by n_rot/2
13390
14531
  case LLM_ARCH_FALCON:
14532
+ case LLM_ARCH_GROK:
13391
14533
  case LLM_ARCH_PERSIMMON:
13392
14534
  case LLM_ARCH_BERT:
13393
14535
  case LLM_ARCH_NOMIC_BERT:
@@ -13766,27 +14908,33 @@ void llama_kv_cache_update(struct llama_context * ctx) {
13766
14908
 
13767
14909
  // Returns the *maximum* size of the state
13768
14910
  size_t llama_get_state_size(const struct llama_context * ctx) {
14911
+ const auto & cparams = ctx->cparams;
14912
+ const auto & hparams = ctx->model.hparams;
14913
+
13769
14914
  // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
13770
14915
  // for reference, std::mt19937(1337) serializes to 6701 bytes.
13771
14916
  const size_t s_rng_size = sizeof(size_t);
13772
14917
  const size_t s_rng = LLAMA_MAX_RNG_STATE;
14918
+ const size_t s_n_outputs = sizeof(size_t);
14919
+ // assume worst case for outputs although only currently set ones are serialized
14920
+ const size_t s_output_pos = ctx->cparams.n_batch * sizeof(int32_t);
13773
14921
  const size_t s_logits_size = sizeof(size_t);
13774
- // assume worst case for logits although only currently set ones are serialized
13775
- const size_t s_logits = ctx->logits_size * sizeof(float);
14922
+ const size_t s_logits = ctx->logits_size ? cparams.n_batch * hparams.n_vocab * sizeof(float) : 0;
13776
14923
  const size_t s_embedding_size = sizeof(size_t);
13777
- const size_t s_embedding = ctx->embd_size * sizeof(float);
14924
+ const size_t s_embedding = ctx->embd_size ? cparams.n_batch * hparams.n_embd * sizeof(float) : 0;
13778
14925
  const size_t s_kv_buf_size = sizeof(size_t);
13779
14926
  const size_t s_kv_head = sizeof(uint32_t);
13780
14927
  const size_t s_kv_size = sizeof(uint32_t);
13781
14928
  const size_t s_kv_used = sizeof(uint32_t);
13782
14929
  const size_t s_kv = ctx->kv_self.total_size();
13783
- // TODO: assume the max is more than 1 seq_id per KV cell
13784
- const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
14930
+ const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
13785
14931
  const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
13786
14932
 
13787
14933
  const size_t s_total = (
13788
14934
  + s_rng_size
13789
14935
  + s_rng
14936
+ + s_n_outputs
14937
+ + s_output_pos
13790
14938
  + s_logits_size
13791
14939
  + s_logits
13792
14940
  + s_embedding_size
@@ -13861,7 +15009,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13861
15009
  std::ostringstream rng_ss;
13862
15010
  rng_ss << ctx->rng;
13863
15011
 
13864
- const std::string & rng_str = rng_ss.str();
15012
+ const std::string & rng_str = rng_ss.str();
13865
15013
  const size_t rng_size = rng_str.size();
13866
15014
 
13867
15015
  GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
@@ -13870,25 +15018,61 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13870
15018
  data_ctx->write(rng_str.data(), rng_size);
13871
15019
  }
13872
15020
 
13873
- // copy logits
15021
+ // copy outputs
13874
15022
  {
13875
- const size_t logits_size = ctx->logits_size;
15023
+ // Can't use ctx->n_outputs because it's not for the
15024
+ // entire last batch when n_ubatch is smaller than n_batch
15025
+ size_t n_outputs = 0;
13876
15026
 
13877
- data_ctx->write(&logits_size, sizeof(logits_size));
15027
+ // copy output ids
15028
+ {
15029
+ std::vector<int32_t> output_pos;
13878
15030
 
13879
- if (logits_size) {
13880
- data_ctx->write(ctx->logits, logits_size * sizeof(float));
15031
+ const size_t n_batch = ctx->cparams.n_batch;
15032
+ const auto & output_ids = ctx->output_ids;
15033
+
15034
+ output_pos.resize(ctx->output_size);
15035
+
15036
+ // build a more compact representation of the output ids
15037
+ for (size_t i = 0; i < n_batch; ++i) {
15038
+ // map an output id to a position in the batch
15039
+ int32_t pos = output_ids[i];
15040
+ if (pos >= 0) {
15041
+ if ((size_t) pos >= n_outputs) {
15042
+ n_outputs = pos + 1;
15043
+ }
15044
+ GGML_ASSERT((size_t) pos < ctx->output_size);
15045
+ output_pos[pos] = i;
15046
+ }
15047
+ }
15048
+
15049
+ data_ctx->write(&n_outputs, sizeof(n_outputs));
15050
+
15051
+ if (n_outputs) {
15052
+ data_ctx->write(output_pos.data(), n_outputs * sizeof(int32_t));
15053
+ }
13881
15054
  }
13882
- }
13883
15055
 
13884
- // copy embeddings
13885
- {
13886
- const size_t embeddings_size = ctx->embd_size;
15056
+ // copy logits
15057
+ {
15058
+ const size_t logits_size = std::min(ctx->logits_size, n_outputs * ctx->model.hparams.n_vocab);
13887
15059
 
13888
- data_ctx->write(&embeddings_size, sizeof(embeddings_size));
15060
+ data_ctx->write(&logits_size, sizeof(logits_size));
13889
15061
 
13890
- if (embeddings_size) {
13891
- data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
15062
+ if (logits_size) {
15063
+ data_ctx->write(ctx->logits, logits_size * sizeof(float));
15064
+ }
15065
+ }
15066
+
15067
+ // copy embeddings
15068
+ {
15069
+ const size_t embeddings_size = std::min(ctx->embd_size, n_outputs * ctx->model.hparams.n_embd);
15070
+
15071
+ data_ctx->write(&embeddings_size, sizeof(embeddings_size));
15072
+
15073
+ if (embeddings_size) {
15074
+ data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
15075
+ }
13892
15076
  }
13893
15077
  }
13894
15078
 
@@ -13901,9 +15085,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13901
15085
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
13902
15086
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
13903
15087
 
13904
- const size_t kv_buf_size = kv_self.total_size();
15088
+ // NOTE: kv_size and kv_buf_size are mostly used for sanity checks
13905
15089
  const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
13906
15090
  const uint32_t kv_size = kv_self.size;
15091
+ const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
13907
15092
  const uint32_t kv_used = kv_self.used;
13908
15093
 
13909
15094
  data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
@@ -13912,6 +15097,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13912
15097
  data_ctx->write(&kv_used, sizeof(kv_used));
13913
15098
 
13914
15099
  if (kv_buf_size) {
15100
+ const size_t pre_kv_buf_size = data_ctx->get_size_written();
15101
+
13915
15102
  std::vector<uint8_t> tmp_buf;
13916
15103
  for (int il = 0; il < (int) n_layer; ++il) {
13917
15104
  const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
@@ -13941,6 +15128,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13941
15128
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
13942
15129
  }
13943
15130
  }
15131
+ GGML_ASSERT(kv_buf_size == data_ctx->get_size_written() - pre_kv_buf_size);
13944
15132
  }
13945
15133
 
13946
15134
  for (uint32_t i = 0; i < kv_head; ++i) {
@@ -13985,6 +15173,28 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
13985
15173
  GGML_ASSERT(!rng_ss.fail());
13986
15174
  }
13987
15175
 
15176
+ // set output ids
15177
+ {
15178
+ size_t n_outputs;
15179
+ std::vector<int32_t> output_pos;
15180
+
15181
+ memcpy(&n_outputs, inp, sizeof(n_outputs)); inp += sizeof(n_outputs);
15182
+
15183
+ GGML_ASSERT(n_outputs <= llama_output_reserve(*ctx, n_outputs));
15184
+
15185
+ if (n_outputs) {
15186
+ output_pos.resize(n_outputs);
15187
+ memcpy(output_pos.data(), inp, n_outputs * sizeof(int32_t));
15188
+ inp += n_outputs * sizeof(int32_t);
15189
+
15190
+ for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
15191
+ int32_t id = output_pos[i];
15192
+ GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
15193
+ ctx->output_ids[id] = i;
15194
+ }
15195
+ }
15196
+ }
15197
+
13988
15198
  // set logits
13989
15199
  {
13990
15200
  size_t logits_size;
@@ -14005,7 +15215,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
14005
15215
 
14006
15216
  memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
14007
15217
 
14008
- GGML_ASSERT(ctx->embd_size == embeddings_size);
15218
+ GGML_ASSERT(ctx->embd_size >= embeddings_size);
14009
15219
 
14010
15220
  if (embeddings_size) {
14011
15221
  memcpy(ctx->embd, inp, embeddings_size * sizeof(float));
@@ -14032,8 +15242,18 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
14032
15242
  memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
14033
15243
  memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
14034
15244
 
15245
+ if (kv_self.size != kv_size) {
15246
+ // the KV cache needs to be big enough to load all the KV cells from the saved state
15247
+ GGML_ASSERT(kv_self.size >= kv_head);
15248
+
15249
+ LLAMA_LOG_INFO("%s: state contains %d KV cells, was saved with kv_size=%d, but is loaded with kv_size=%d (fine, but different)\n",
15250
+ __func__, kv_head, kv_size, kv_self.size);
15251
+ }
15252
+
14035
15253
  if (kv_buf_size) {
14036
- GGML_ASSERT(kv_self.total_size() == kv_buf_size);
15254
+ const size_t pre_kv_buf_size = inp - src;
15255
+
15256
+ GGML_ASSERT(kv_self.total_size() >= kv_buf_size);
14037
15257
 
14038
15258
  for (int il = 0; il < (int) n_layer; ++il) {
14039
15259
  const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
@@ -14053,23 +15273,21 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
14053
15273
 
14054
15274
  // v is not contiguous, copy row by row
14055
15275
  const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
14056
- const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
15276
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_self.size);
14057
15277
 
14058
15278
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
14059
15279
  ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
14060
15280
  inp += v_row_size;
14061
15281
  }
14062
15282
  }
15283
+ GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
14063
15284
  }
14064
15285
 
14065
- GGML_ASSERT(kv_self.size == kv_size);
15286
+ llama_kv_cache_clear(ctx);
14066
15287
 
14067
15288
  ctx->kv_self.head = kv_head;
14068
- ctx->kv_self.size = kv_size;
14069
15289
  ctx->kv_self.used = kv_used;
14070
15290
 
14071
- ctx->kv_self.cells.resize(kv_size);
14072
-
14073
15291
  for (uint32_t i = 0; i < kv_head; ++i) {
14074
15292
  llama_pos pos;
14075
15293
  size_t seq_id_size;
@@ -14086,11 +15304,6 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
14086
15304
  ctx->kv_self.cells[i].seq_id.insert(seq_id);
14087
15305
  }
14088
15306
  }
14089
-
14090
- for (uint32_t i = kv_head; i < kv_size; ++i) {
14091
- ctx->kv_self.cells[i].pos = -1;
14092
- ctx->kv_self.cells[i].seq_id.clear();
14093
- }
14094
15307
  }
14095
15308
 
14096
15309
  const size_t nread = inp - src;
@@ -14296,11 +15509,33 @@ float * llama_get_logits(struct llama_context * ctx) {
14296
15509
  }
14297
15510
 
14298
15511
  float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
14299
- assert(ctx->logits_valid.at(i));
14300
-
14301
15512
  llama_synchronize(ctx);
14302
15513
 
14303
- return ctx->logits + i*ctx->model.hparams.n_vocab;
15514
+ try {
15515
+ if (ctx->logits == nullptr) {
15516
+ throw std::runtime_error("no logits");
15517
+ }
15518
+ if ((size_t) i >= ctx->output_ids.size()) {
15519
+ throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
15520
+ }
15521
+ const int32_t j = ctx->output_ids[i];
15522
+
15523
+ if (j < 0) {
15524
+ throw std::runtime_error(format("batch.logits[%d] != true", i));
15525
+ }
15526
+ if ((size_t) j >= ctx->output_size) {
15527
+ // This should not happen
15528
+ throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
15529
+ }
15530
+
15531
+ return ctx->logits + j*ctx->model.hparams.n_vocab;
15532
+ } catch (const std::exception & err) {
15533
+ LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
15534
+ #ifndef NDEBUG
15535
+ GGML_ASSERT(false);
15536
+ #endif
15537
+ return nullptr;
15538
+ }
14304
15539
  }
14305
15540
 
14306
15541
  float * llama_get_embeddings(struct llama_context * ctx) {
@@ -14312,7 +15547,31 @@ float * llama_get_embeddings(struct llama_context * ctx) {
14312
15547
  float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
14313
15548
  llama_synchronize(ctx);
14314
15549
 
14315
- return ctx->embd + i*ctx->model.hparams.n_embd;
15550
+ try {
15551
+ if (ctx->embd == nullptr) {
15552
+ throw std::runtime_error("no embeddings");
15553
+ }
15554
+ if ((size_t) i >= ctx->output_ids.size()) {
15555
+ throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
15556
+ }
15557
+ const int32_t j = ctx->output_ids[i];
15558
+
15559
+ if (j < 0) {
15560
+ throw std::runtime_error(format("batch.logits[%d] != true", i));
15561
+ }
15562
+ if ((size_t) j >= ctx->output_size) {
15563
+ // This should not happen
15564
+ throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
15565
+ }
15566
+
15567
+ return ctx->embd + j*ctx->model.hparams.n_embd;
15568
+ } catch (const std::exception & err) {
15569
+ LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
15570
+ #ifndef NDEBUG
15571
+ GGML_ASSERT(false);
15572
+ #endif
15573
+ return nullptr;
15574
+ }
14316
15575
  }
14317
15576
 
14318
15577
  float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
@@ -14602,6 +15861,55 @@ static int32_t llama_chat_apply_template_internal(
14602
15861
  ss << message->content << "</s>";
14603
15862
  }
14604
15863
  }
15864
+ } else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) {
15865
+ // openchat/openchat-3.5-0106,
15866
+ for (auto message : chat) {
15867
+ std::string role(message->role);
15868
+ if (role == "system") {
15869
+ ss << message->content << "<|end_of_turn|>";
15870
+ } else {
15871
+ role[0] = toupper(role[0]);
15872
+ ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
15873
+ }
15874
+ }
15875
+ if (add_ass) {
15876
+ ss << "GPT4 Correct Assistant:";
15877
+ }
15878
+ } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) {
15879
+ // eachadea/vicuna-13b-1.1 (and Orca variant)
15880
+ for (auto message : chat) {
15881
+ std::string role(message->role);
15882
+ if (role == "system") {
15883
+ // Orca-Vicuna variant uses a system prefix
15884
+ if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) {
15885
+ ss << "SYSTEM: " << message->content << "\n";
15886
+ } else {
15887
+ ss << message->content << "\n\n";
15888
+ }
15889
+ } else if (role == "user") {
15890
+ ss << "USER: " << message->content << "\n";
15891
+ } else if (role == "assistant") {
15892
+ ss << "ASSISTANT: " << message->content << "</s>\n";
15893
+ }
15894
+ }
15895
+ if (add_ass) {
15896
+ ss << "ASSISTANT:";
15897
+ }
15898
+ } else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) {
15899
+ // deepseek-ai/deepseek-coder-33b-instruct
15900
+ for (auto message : chat) {
15901
+ std::string role(message->role);
15902
+ if (role == "system") {
15903
+ ss << message->content;
15904
+ } else if (role == "user") {
15905
+ ss << "### Instruction:\n" << message->content << "\n";
15906
+ } else if (role == "assistant") {
15907
+ ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
15908
+ }
15909
+ }
15910
+ if (add_ass) {
15911
+ ss << "### Response:\n";
15912
+ }
14605
15913
  } else {
14606
15914
  // template not supported
14607
15915
  return -1;
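The three new templates above are reachable either through a model's own tokenizer.chat_template metadata or by passing their short names directly. A sketch of exercising the "deepseek" branch through the public llama_chat_apply_template() API (its llama.h declaration is assumed here; with an explicit template string the model pointer can be null):

    std::vector<llama_chat_message> msgs = {
        { "system", "You are a helpful coding assistant." },
        { "user",   "Write hello world in C++." },
    };
    std::vector<char> buf(4096);

    const int32_t n = llama_chat_apply_template(nullptr, "deepseek", msgs.data(), msgs.size(),
                                                /*add_ass=*/ true, buf.data(), (int32_t) buf.size());
    if (n > 0 && (size_t) n <= buf.size()) {
        std::string prompt(buf.data(), n);
        // prompt ends with "### Response:\n", matching the add_ass branch above
    }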
@@ -14651,6 +15959,30 @@ LLAMA_API int32_t llama_chat_apply_template(
14651
15959
  return res;
14652
15960
  }
14653
15961
 
15962
+ LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
15963
+ static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
15964
+ if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
15965
+ return strlen(split_path);
15966
+ }
15967
+ return 0;
15968
+ }
15969
+
15970
+ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
15971
+ std::string str_split_path(split_path);
15972
+ char postfix[32];
15973
+ snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
15974
+ std::string str_postfix(postfix);
15975
+
15976
+ // check if dest ends with postfix
15977
+ int size_prefix = str_split_path.size() - str_postfix.size();
15978
+ if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
15979
+ snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
15980
+ return size_prefix;
15981
+ }
15982
+
15983
+ return 0;
15984
+ }
15985
+
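Usage sketch for the two split helpers just added (buffer sizes and paths here are arbitrary):

    char split_path[512];
    llama_split_path(split_path, sizeof(split_path), "/models/grok-1/grok-1-Q4_0", 0, 9);
    // split_path == "/models/grok-1/grok-1-Q4_0-00001-of-00009.gguf"

    char prefix[512];
    const int n = llama_split_prefix(prefix, sizeof(prefix), split_path, 0, 9);
    // n > 0 and prefix == "/models/grok-1/grok-1-Q4_0" when the name matches the -%05d-of-%05d.gguf pattern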
14654
15986
  struct llama_timings llama_get_timings(struct llama_context * ctx) {
14655
15987
  struct llama_timings result = {
14656
15988
  /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,