llama_cpp 0.14.2 → 0.14.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,7 @@
7
7
  #include "ggml-alloc.h"
8
8
  #include "ggml-backend.h"
9
9
 
10
- #ifdef GGML_USE_CUBLAS
10
+ #ifdef GGML_USE_CUDA
11
11
  # include "ggml-cuda.h"
12
12
  #elif defined(GGML_USE_CLBLAST)
13
13
  # include "ggml-opencl.h"
@@ -52,12 +52,16 @@
52
52
  #define NOMINMAX
53
53
  #endif
54
54
  #include <windows.h>
55
+ #ifndef PATH_MAX
56
+ #define PATH_MAX MAX_PATH
57
+ #endif
55
58
  #include <io.h>
56
59
  #endif
57
60
 
58
61
  #include <algorithm>
59
62
  #include <array>
60
63
  #include <cassert>
64
+ #include <cctype>
61
65
  #include <cfloat>
62
66
  #include <cinttypes>
63
67
  #include <climits>
@@ -68,7 +72,6 @@
68
72
  #include <cstdio>
69
73
  #include <cstring>
70
74
  #include <ctime>
71
- #include <cwctype>
72
75
  #include <forward_list>
73
76
  #include <fstream>
74
77
  #include <functional>
@@ -192,6 +195,7 @@ enum llm_arch {
192
195
  LLM_ARCH_LLAMA,
193
196
  LLM_ARCH_FALCON,
194
197
  LLM_ARCH_BAICHUAN,
198
+ LLM_ARCH_GROK,
195
199
  LLM_ARCH_GPT2,
196
200
  LLM_ARCH_GPTJ,
197
201
  LLM_ARCH_GPTNEOX,
@@ -214,12 +218,15 @@ enum llm_arch {
214
218
  LLM_ARCH_GEMMA,
215
219
  LLM_ARCH_STARCODER2,
216
220
  LLM_ARCH_MAMBA,
221
+ LLM_ARCH_XVERSE,
222
+ LLM_ARCH_COMMAND_R,
217
223
  LLM_ARCH_UNKNOWN,
218
224
  };
219
225
 
220
226
  static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
221
227
  { LLM_ARCH_LLAMA, "llama" },
222
228
  { LLM_ARCH_FALCON, "falcon" },
229
+ { LLM_ARCH_GROK, "grok" },
223
230
  { LLM_ARCH_GPT2, "gpt2" },
224
231
  { LLM_ARCH_GPTJ, "gptj" },
225
232
  { LLM_ARCH_GPTNEOX, "gptneox" },
@@ -243,6 +250,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
243
250
  { LLM_ARCH_GEMMA, "gemma" },
244
251
  { LLM_ARCH_STARCODER2, "starcoder2" },
245
252
  { LLM_ARCH_MAMBA, "mamba" },
253
+ { LLM_ARCH_XVERSE, "xverse" },
254
+ { LLM_ARCH_COMMAND_R, "command-r" },
246
255
  { LLM_ARCH_UNKNOWN, "(unknown)" },
247
256
  };
248
257
 
@@ -268,6 +277,7 @@ enum llm_kv {
268
277
  LLM_KV_EXPERT_COUNT,
269
278
  LLM_KV_EXPERT_USED_COUNT,
270
279
  LLM_KV_POOLING_TYPE,
280
+ LLM_KV_LOGIT_SCALE,
271
281
 
272
282
  LLM_KV_ATTENTION_HEAD_COUNT,
273
283
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -287,6 +297,10 @@ enum llm_kv {
287
297
  LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
288
298
  LLM_KV_ROPE_SCALING_FINETUNED,
289
299
 
300
+ LLM_KV_SPLIT_NO,
301
+ LLM_KV_SPLIT_COUNT,
302
+ LLM_KV_SPLIT_TENSORS_COUNT,
303
+
290
304
  LLM_KV_SSM_INNER_SIZE,
291
305
  LLM_KV_SSM_CONV_KERNEL,
292
306
  LLM_KV_SSM_STATE_SIZE,
@@ -332,6 +346,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
332
346
  { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
333
347
  { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
334
348
  { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
349
+ { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
335
350
 
336
351
  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
337
352
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -351,6 +366,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
351
366
  { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
352
367
  { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
353
368
 
369
+ { LLM_KV_SPLIT_NO, "split.no" },
370
+ { LLM_KV_SPLIT_COUNT, "split.count" },
371
+ { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
372
+
354
373
  { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
355
374
  { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
356
375
  { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
@@ -407,9 +426,12 @@ enum llm_tensor {
407
426
  LLM_TENSOR_FFN_DOWN,
408
427
  LLM_TENSOR_FFN_UP,
409
428
  LLM_TENSOR_FFN_ACT,
410
- LLM_TENSOR_FFN_DOWN_EXP,
429
+ LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
411
430
  LLM_TENSOR_FFN_GATE_EXP,
412
431
  LLM_TENSOR_FFN_UP_EXP,
432
+ LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
433
+ LLM_TENSOR_FFN_GATE_EXPS,
434
+ LLM_TENSOR_FFN_UP_EXPS,
413
435
  LLM_TENSOR_ATTN_Q_NORM,
414
436
  LLM_TENSOR_ATTN_K_NORM,
415
437
  LLM_TENSOR_LAYER_OUT_NORM,
@@ -444,6 +466,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
444
466
  { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
445
467
  { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
446
468
  { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
469
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
470
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
471
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
447
472
  },
448
473
  },
449
474
  {
@@ -479,6 +504,31 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
479
504
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
480
505
  },
481
506
  },
507
+ {
508
+ LLM_ARCH_GROK,
509
+ {
510
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
511
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
512
+ { LLM_TENSOR_OUTPUT, "output" },
513
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
514
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
515
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
516
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
517
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
518
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
519
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
520
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
521
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
522
+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
523
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
524
+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
525
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
526
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
527
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
528
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
529
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
530
+ },
531
+ },
482
532
  {
483
533
  LLM_ARCH_GPT2,
484
534
  {
@@ -536,6 +586,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
536
586
  {
537
587
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
538
588
  { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
589
+ { LLM_TENSOR_OUTPUT, "output"},
539
590
  { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
540
591
  { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
541
592
  { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
@@ -543,6 +594,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
543
594
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
544
595
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
545
596
  { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
597
+ { LLM_TENSOR_POS_EMBD, "position_embd" },
598
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
599
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
546
600
  },
547
601
  },
548
602
  {
@@ -838,6 +892,40 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
838
892
  { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
839
893
  },
840
894
  },
895
+ {
896
+ LLM_ARCH_XVERSE,
897
+ {
898
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
899
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
900
+ { LLM_TENSOR_OUTPUT, "output" },
901
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
902
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
903
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
904
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
905
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
906
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
907
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
908
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
909
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
910
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
911
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
912
+ },
913
+ },
914
+ {
915
+ LLM_ARCH_COMMAND_R,
916
+ {
917
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
918
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
919
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
920
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
921
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
922
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
923
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
924
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
925
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
926
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
927
+ },
928
+ },
841
929
  {
842
930
  LLM_ARCH_UNKNOWN,
843
931
  {
@@ -1010,7 +1098,7 @@ struct llama_file {
1010
1098
  size_t size;
1011
1099
 
1012
1100
  llama_file(const char * fname, const char * mode) {
1013
- fp = std::fopen(fname, mode);
1101
+ fp = ggml_fopen(fname, mode);
1014
1102
  if (fp == NULL) {
1015
1103
  throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
1016
1104
  }
@@ -1079,6 +1167,7 @@ struct llama_file {
1079
1167
  }
1080
1168
  }
1081
1169
  };
1170
+ using llama_files = std::vector<std::unique_ptr<llama_file>>;
1082
1171
 
1083
1172
  struct llama_mmap {
1084
1173
  void * addr;
@@ -1279,6 +1368,7 @@ struct llama_mmap {
1279
1368
  }
1280
1369
  #endif
1281
1370
  };
1371
+ using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
1282
1372
 
1283
1373
  // Represents some region of memory being locked using mlock or VirtualLock;
1284
1374
  // will automatically unlock on destruction.
@@ -1428,6 +1518,7 @@ struct llama_mlock {
1428
1518
  static void raw_unlock(const void * addr, size_t len) {}
1429
1519
  #endif
1430
1520
  };
1521
+ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
1431
1522
 
1432
1523
  static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
1433
1524
  std::vector<char> result(8, 0);
@@ -1447,7 +1538,7 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
1447
1538
  static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
1448
1539
  ggml_backend_buffer_type_t buft = nullptr;
1449
1540
 
1450
- #if defined(GGML_USE_CUBLAS)
1541
+ #if defined(GGML_USE_CUDA)
1451
1542
  // host buffers should only be used when data is expected to be copied to/from the GPU
1452
1543
  if (host_buffer) {
1453
1544
  buft = ggml_backend_cuda_host_buffer_type();
@@ -1477,7 +1568,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
1477
1568
 
1478
1569
  #ifdef GGML_USE_METAL
1479
1570
  buft = ggml_backend_metal_buffer_type();
1480
- #elif defined(GGML_USE_CUBLAS)
1571
+ #elif defined(GGML_USE_CUDA)
1481
1572
  buft = ggml_backend_cuda_buffer_type(gpu);
1482
1573
  #elif defined(GGML_USE_VULKAN)
1483
1574
  buft = ggml_backend_vk_buffer_type(gpu);
@@ -1503,7 +1594,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
1503
1594
  static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
1504
1595
  ggml_backend_buffer_type_t buft = nullptr;
1505
1596
 
1506
- #ifdef GGML_USE_CUBLAS
1597
+ #ifdef GGML_USE_CUDA
1507
1598
  if (ggml_backend_cuda_get_device_count() > 1) {
1508
1599
  buft = ggml_backend_cuda_split_buffer_type(tensor_split);
1509
1600
  }
@@ -1524,7 +1615,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
1524
1615
  }
1525
1616
 
1526
1617
  static size_t llama_get_device_count() {
1527
- #if defined(GGML_USE_CUBLAS)
1618
+ #if defined(GGML_USE_CUDA)
1528
1619
  return ggml_backend_cuda_get_device_count();
1529
1620
  #elif defined(GGML_USE_SYCL)
1530
1621
  return ggml_backend_sycl_get_device_count();
@@ -1536,7 +1627,7 @@ static size_t llama_get_device_count() {
1536
1627
  }
1537
1628
 
1538
1629
  static size_t llama_get_device_memory(int device) {
1539
- #if defined(GGML_USE_CUBLAS)
1630
+ #if defined(GGML_USE_CUDA)
1540
1631
  size_t total;
1541
1632
  size_t free;
1542
1633
  ggml_backend_cuda_get_device_memory(device, &total, &free);
@@ -1597,9 +1688,11 @@ enum e_model {
1597
1688
  MODEL_20B,
1598
1689
  MODEL_30B,
1599
1690
  MODEL_34B,
1691
+ MODEL_35B,
1600
1692
  MODEL_40B,
1601
1693
  MODEL_65B,
1602
1694
  MODEL_70B,
1695
+ MODEL_314B,
1603
1696
  MODEL_SMALL,
1604
1697
  MODEL_MEDIUM,
1605
1698
  MODEL_LARGE,
@@ -1643,6 +1736,7 @@ struct llama_hparams {
1643
1736
 
1644
1737
  float f_clamp_kqv = 0.0f;
1645
1738
  float f_max_alibi_bias = 0.0f;
1739
+ float f_logit_scale = 0.0f;
1646
1740
 
1647
1741
  bool causal_attn = true;
1648
1742
  bool need_kq_pos = false;
@@ -1716,6 +1810,7 @@ struct llama_cparams {
1716
1810
  uint32_t n_ctx; // context size used during inference
1717
1811
  uint32_t n_batch;
1718
1812
  uint32_t n_ubatch;
1813
+ uint32_t n_seq_max;
1719
1814
  uint32_t n_threads; // number of threads to use for generation
1720
1815
  uint32_t n_threads_batch; // number of threads to use for batch processing
1721
1816
 
@@ -1781,9 +1876,9 @@ struct llama_layer {
1781
1876
 
1782
1877
  // ff MoE
1783
1878
  struct ggml_tensor * ffn_gate_inp;
1784
- struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
1785
- struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
1786
- struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
1879
+ struct ggml_tensor * ffn_gate_exps;
1880
+ struct ggml_tensor * ffn_down_exps;
1881
+ struct ggml_tensor * ffn_up_exps ;
1787
1882
 
1788
1883
  // ff bias
1789
1884
  struct ggml_tensor * ffn_down_b; // b2
@@ -1873,6 +1968,31 @@ struct llama_kv_cache {
1873
1968
  }
1874
1969
  };
1875
1970
 
1971
+ struct llama_control_vector {
1972
+ std::vector<struct ggml_tensor *> tensors; // per layer
1973
+ std::vector<struct ggml_context *> ctxs;
1974
+ std::vector<ggml_backend_buffer_t> bufs;
1975
+
1976
+ int32_t layer_start = -1;
1977
+ int32_t layer_end = -1;
1978
+
1979
+ ggml_tensor * tensor_for(int il) const {
1980
+ if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
1981
+ return nullptr;
1982
+ }
1983
+ return tensors[il];
1984
+ }
1985
+
1986
+ ~llama_control_vector() {
1987
+ for (struct ggml_context * ctx : ctxs) {
1988
+ ggml_free(ctx);
1989
+ }
1990
+ for (ggml_backend_buffer_t buf : bufs) {
1991
+ ggml_backend_buffer_free(buf);
1992
+ }
1993
+ }
1994
+ };
1995
+
1876
1996
  struct llama_vocab {
1877
1997
  using id = int32_t;
1878
1998
  using token = std::string;
@@ -1976,12 +2096,12 @@ struct llama_model {
1976
2096
  // the model memory buffers for the tensor data
1977
2097
  std::vector<ggml_backend_buffer_t> bufs;
1978
2098
 
1979
- // model memory mapped file
1980
- std::unique_ptr<llama_mmap> mapping;
2099
+ // model memory mapped files
2100
+ llama_mmaps mappings;
1981
2101
 
1982
2102
  // objects representing data potentially being locked in memory
1983
- std::vector<std::unique_ptr<llama_mlock>> mlock_bufs;
1984
- llama_mlock mlock_mmap;
2103
+ llama_mlocks mlock_bufs;
2104
+ llama_mlocks mlock_mmaps;
1985
2105
 
1986
2106
  // for quantize-stats only
1987
2107
  std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
@@ -1994,6 +2114,11 @@ struct llama_model {
1994
2114
  ggml_free(ctx);
1995
2115
  }
1996
2116
  for (ggml_backend_buffer_t buf : bufs) {
2117
+ #ifdef GGML_USE_CUDA
2118
+ if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
2119
+ ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
2120
+ }
2121
+ #endif
1997
2122
  ggml_backend_buffer_free(buf);
1998
2123
  }
1999
2124
  }
@@ -2008,10 +2133,6 @@ struct llama_context {
2008
2133
  ggml_backend_free(backend);
2009
2134
  }
2010
2135
 
2011
- #ifdef GGML_USE_VULKAN
2012
- ggml_vk_free_cpu_assist();
2013
- #endif
2014
-
2015
2136
  ggml_backend_buffer_free(buf_output);
2016
2137
  }
2017
2138
 
@@ -2048,20 +2169,20 @@ struct llama_context {
2048
2169
  // host buffer for the model output (logits and embeddings)
2049
2170
  ggml_backend_buffer_t buf_output = nullptr;
2050
2171
 
2051
- // decode output (2-dimensional array: [n_tokens][n_vocab])
2052
- size_t logits_size = 0;
2053
- float * logits = nullptr;
2172
+ // decode output (2-dimensional array: [n_outputs][n_vocab])
2173
+ size_t logits_size = 0; // capacity (of floats) for logits
2174
+ float * logits = nullptr;
2175
+
2176
+ std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
2177
+ size_t output_size = 0; // capacity (of tokens positions) for the output buffers
2178
+ int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch
2054
2179
 
2055
- #ifndef NDEBUG
2056
- // guard against access to unset logits
2057
- std::vector<bool> logits_valid;
2058
- #endif
2059
2180
  bool logits_all = false;
2060
2181
 
2061
- // embeddings output (2-dimensional array: [n_tokens][n_embd])
2182
+ // embeddings output (2-dimensional array: [n_outputs][n_embd])
2062
2183
  // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
2063
- size_t embd_size = 0;
2064
- float * embd = nullptr;
2184
+ size_t embd_size = 0; // capacity (of floats) for embeddings
2185
+ float * embd = nullptr;
2065
2186
 
2066
2187
  // sequence embeddings output (map of [n_embd] vectors)
2067
2188
  // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
@@ -2078,14 +2199,18 @@ struct llama_context {
2078
2199
  struct ggml_tensor * inp_tokens; // I32 [n_batch]
2079
2200
  struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
2080
2201
  struct ggml_tensor * inp_pos; // I32 [n_batch]
2202
+ struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
2081
2203
  struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
2082
- struct ggml_tensor * inp_KQ_pos; // F32 [kv_size]
2204
+ struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
2083
2205
  struct ggml_tensor * inp_K_shift; // I32 [kv_size]
2084
2206
  struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
2085
2207
  struct ggml_tensor * inp_cls; // I32 [n_batch]
2086
2208
  struct ggml_tensor * inp_s_copy; // I32 [kv_size]
2087
- struct ggml_tensor * inp_s_mask; // F32 [1, kv_size]
2088
- struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch]
2209
+ struct ggml_tensor * inp_s_mask; // F32 [1, n_kv]
2210
+ struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
2211
+
2212
+ // control vectors
2213
+ struct llama_control_vector cvec;
2089
2214
 
2090
2215
  #ifdef GGML_USE_MPI
2091
2216
  ggml_mpi_context * ctx_mpi = NULL;
@@ -2737,6 +2862,8 @@ namespace GGUFMeta {
2737
2862
  };
2738
2863
  }
2739
2864
 
2865
+ using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
2866
+
2740
2867
  struct llama_model_loader {
2741
2868
  int n_kv = 0;
2742
2869
  int n_tensors = 0;
@@ -2747,54 +2874,133 @@ struct llama_model_loader {
2747
2874
 
2748
2875
  bool use_mmap = false;
2749
2876
 
2750
- llama_file file;
2877
+ llama_files files;
2751
2878
  llama_ftype ftype;
2752
2879
  llama_fver fver;
2753
2880
 
2754
- std::unique_ptr<llama_mmap> mapping;
2881
+ llama_mmaps mappings;
2882
+
2883
+ // Holds information on a model weight
2884
+ struct llama_tensor_weight {
2885
+ uint16_t idx; // source file index
2886
+ size_t offs; // tensor data offset in the original file
2887
+
2888
+ ggml_tensor * tensor;
2889
+
2890
+ llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
2891
+ const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
2892
+ offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
2893
+ }
2894
+ };
2895
+ std::vector<llama_tensor_weight> weights;
2896
+
2755
2897
  std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
2756
2898
 
2757
- struct gguf_context * ctx_gguf = NULL;
2758
- struct ggml_context * ctx_meta = NULL;
2899
+ struct gguf_context * meta = NULL;
2900
+ std::vector<ggml_context *> contexts;
2759
2901
 
2760
2902
  std::string arch_name;
2761
2903
  LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
2762
2904
 
2763
- llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
2905
+ llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
2764
2906
  int trace = 0;
2765
2907
  if (getenv("LLAMA_TRACE")) {
2766
2908
  trace = atoi(getenv("LLAMA_TRACE"));
2767
2909
  }
2768
2910
 
2769
- struct gguf_init_params params = {
2770
- /*.no_alloc = */ true,
2771
- /*.ctx = */ &ctx_meta,
2772
- };
2773
-
2774
2911
  if (param_overrides_p != nullptr) {
2775
2912
  for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
2776
2913
  kv_overrides.insert({std::string(p->key), *p});
2777
2914
  }
2778
2915
  }
2779
2916
 
2780
- ctx_gguf = gguf_init_from_file(fname.c_str(), params);
2781
- if (!ctx_gguf) {
2917
+ struct ggml_context * ctx = NULL;
2918
+ struct gguf_init_params params = {
2919
+ /*.no_alloc = */ true,
2920
+ /*.ctx = */ &ctx,
2921
+ };
2922
+
2923
+ meta = gguf_init_from_file(fname.c_str(), params);
2924
+ if (!meta) {
2782
2925
  throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
2783
2926
  }
2784
2927
 
2785
2928
  get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
2786
2929
  llm_kv = LLM_KV(llm_arch_from_string(arch_name));
2787
2930
 
2788
- n_kv = gguf_get_n_kv(ctx_gguf);
2789
- n_tensors = gguf_get_n_tensors(ctx_gguf);
2931
+ // Save tensors data offset of the main file.
2932
+ // For subsidiary files, `meta` tensor data offset must not be used,
2933
+ // so we build a unified tensors index for weights.
2934
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
2935
+ weights.emplace_back(0, cur->name, meta, cur);
2936
+ }
2937
+ files.emplace_back(new llama_file(fname.c_str(), "rb"));
2938
+ contexts.emplace_back(ctx);
2939
+
2940
+ uint16_t n_split = 0;
2941
+ get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
2942
+
2943
+ // Load additional GGML contexts
2944
+ if (n_split > 1) {
2945
+ uint16_t idx = 0;
2946
+ get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
2947
+ if (idx != 0) {
2948
+ throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
2949
+ }
2950
+
2951
+ char split_prefix[PATH_MAX] = {0};
2952
+ if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) {
2953
+ throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
2954
+ }
2955
+
2956
+ if (trace > 0) {
2957
+ LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
2958
+ }
2959
+
2960
+ char split_path[PATH_MAX] = {0};
2961
+ for (idx = 1; idx < n_split; idx++) {
2962
+ llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
2963
+
2964
+ struct gguf_init_params split_params = {
2965
+ /*.no_alloc = */ true,
2966
+ /*.ctx = */ &ctx,
2967
+ };
2968
+ struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
2969
+ if (!ctx_gguf) {
2970
+ throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
2971
+ }
2972
+
2973
+ // Save tensors data offset info of the shard.
2974
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
2975
+ weights.emplace_back(idx, cur->name, ctx_gguf, cur);
2976
+ }
2977
+ files.emplace_back(new llama_file(split_path, "rb"));
2978
+ contexts.emplace_back(ctx);
2979
+
2980
+ gguf_free(ctx_gguf);
2981
+ }
2982
+
2983
+ get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
2984
+
2985
+ // sanity check
2986
+ {
2987
+ const int n_tensors_loaded = (int) weights.size();
2988
+ if (n_tensors != n_tensors_loaded) {
2989
+ throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
2990
+ }
2991
+ }
2790
2992
 
2791
- fver = (enum llama_fver ) gguf_get_version(ctx_gguf);
2993
+ LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
2994
+ }
2995
+
2996
+ n_kv = gguf_get_n_kv(meta);
2997
+ n_tensors = weights.size();
2792
2998
 
2793
- for (int i = 0; i < n_tensors; i++) {
2794
- const char * name = gguf_get_tensor_name(ctx_gguf, i);
2795
- struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
2796
- n_elements += ggml_nelements(t);
2797
- n_bytes += ggml_nbytes(t);
2999
+ fver = (enum llama_fver) gguf_get_version(meta);
3000
+
3001
+ for (auto & w : weights) {
3002
+ n_elements += ggml_nelements(w.tensor);
3003
+ n_bytes += ggml_nbytes(w.tensor);
2798
3004
  }
2799
3005
 
2800
3006
  LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -2809,7 +3015,8 @@ struct llama_model_loader {
2809
3015
  enum ggml_type type_max = GGML_TYPE_F32;
2810
3016
 
2811
3017
  for (int i = 0; i < n_tensors; i++) {
2812
- enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);
3018
+ const ggml_tensor * tensor = weights.at(i).tensor;
3019
+ enum ggml_type type = tensor->type;
2813
3020
 
2814
3021
  n_type[type]++;
2815
3022
 
@@ -2819,8 +3026,8 @@ struct llama_model_loader {
2819
3026
  }
2820
3027
 
2821
3028
  if (trace > 0) {
2822
- struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
2823
- LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
3029
+ const uint16_t sid = weights.at(i).idx;
3030
+ LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
2824
3031
  }
2825
3032
  }
2826
3033
 
@@ -2842,6 +3049,7 @@ struct llama_model_loader {
2842
3049
  case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
2843
3050
  case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
2844
3051
  case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
3052
+ case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break;
2845
3053
  case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
2846
3054
  case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
2847
3055
  case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
@@ -2856,22 +3064,23 @@ struct llama_model_loader {
2856
3064
  ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
2857
3065
 
2858
3066
  {
2859
- const int kid = gguf_find_key(ctx_gguf, "general.file_type");
3067
+ const int kid = gguf_find_key(meta, "general.file_type");
2860
3068
  if (kid >= 0) {
2861
- ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
3069
+ ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
2862
3070
  }
2863
3071
  }
2864
3072
 
2865
3073
  LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
3074
+
2866
3075
  for (int i = 0; i < n_kv; i++) {
2867
- const char * name = gguf_get_key(ctx_gguf, i);
2868
- const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
3076
+ const char * name = gguf_get_key(meta, i);
3077
+ const enum gguf_type type = gguf_get_kv_type(meta, i);
2869
3078
  const std::string type_name =
2870
3079
  type == GGUF_TYPE_ARRAY
2871
- ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
3080
+ ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
2872
3081
  : gguf_type_name(type);
2873
3082
 
2874
- std::string value = gguf_kv_to_str(ctx_gguf, i);
3083
+ std::string value = gguf_kv_to_str(meta, i);
2875
3084
  const size_t MAX_VALUE_LEN = 40;
2876
3085
  if (value.size() > MAX_VALUE_LEN) {
2877
3086
  value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
@@ -2900,18 +3109,18 @@ struct llama_model_loader {
2900
3109
  }
2901
3110
 
2902
3111
  ~llama_model_loader() {
2903
- if (ctx_gguf) {
2904
- gguf_free(ctx_gguf);
3112
+ if (meta) {
3113
+ gguf_free(meta);
2905
3114
  }
2906
- if (ctx_meta) {
2907
- ggml_free(ctx_meta);
3115
+ for (auto * ctx : contexts) {
3116
+ ggml_free(ctx);
2908
3117
  }
2909
3118
  }
2910
3119
 
2911
3120
  template<typename T>
2912
3121
  typename std::enable_if<std::is_integral<T>::value, bool>::type
2913
3122
  get_arr_n(const std::string & key, T & result, const bool required = true) {
2914
- const int kid = gguf_find_key(ctx_gguf, key.c_str());
3123
+ const int kid = gguf_find_key(meta, key.c_str());
2915
3124
 
2916
3125
  if (kid < 0) {
2917
3126
  if (required) {
@@ -2921,7 +3130,7 @@ struct llama_model_loader {
2921
3130
  }
2922
3131
 
2923
3132
  struct GGUFMeta::ArrayInfo arr_info =
2924
- GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);
3133
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
2925
3134
 
2926
3135
 
2927
3136
  result = arr_info.length;
@@ -2941,7 +3150,7 @@ struct llama_model_loader {
2941
3150
  const struct llama_model_kv_override * override =
2942
3151
  it != kv_overrides.end() ? &it->second : nullptr;
2943
3152
 
2944
- const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);
3153
+ const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
2945
3154
 
2946
3155
  if (required && !found) {
2947
3156
  throw std::runtime_error(format("key not found in model: %s", key.c_str()));
@@ -2964,28 +3173,57 @@ struct llama_model_loader {
2964
3173
  }
2965
3174
 
2966
3175
  const char * get_tensor_name(int i) const {
2967
- return gguf_get_tensor_name(ctx_gguf, i);
3176
+ return weights.at(i).tensor->name;
3177
+ }
3178
+
3179
+ const llama_tensor_weight * get_weight(const char * name) const {
3180
+ for (const auto & weight : weights) {
3181
+ if (strcmp(name, weight.tensor->name) == 0) {
3182
+ return &weight;
3183
+ }
3184
+ }
3185
+ return nullptr;
3186
+ }
3187
+
3188
+ const llama_tensor_weight & require_weight(const char * name) const {
3189
+ const llama_tensor_weight * weight = get_weight(name);
3190
+ if (!weight) {
3191
+ throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
3192
+ }
3193
+ return *weight;
2968
3194
  }
2969
3195
 
2970
3196
  struct ggml_tensor * get_tensor_meta(const char * name) const {
2971
- return ggml_get_tensor(ctx_meta, name);
3197
+ const auto * weight = get_weight(name);
3198
+ if (!weight) {
3199
+ return nullptr;
3200
+ }
3201
+ return weight->tensor;
3202
+ }
3203
+
3204
+ struct ggml_tensor * require_tensor_meta(const char * name) const {
3205
+ struct ggml_tensor * tensor = get_tensor_meta(name);
3206
+ if (!tensor) {
3207
+ throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
3208
+ }
3209
+ return tensor;
2972
3210
  }
2973
3211
 
2974
3212
  struct ggml_tensor * get_tensor_meta(int i) const {
2975
3213
  return get_tensor_meta(get_tensor_name(i));
2976
3214
  }
2977
3215
 
2978
- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) {
2979
- struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
2980
- ggml_set_name(tensor, ggml_get_name(meta));
3216
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
3217
+ struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
3218
+ ggml_set_name(tensor, ggml_get_name(cur));
2981
3219
 
2982
3220
  n_created++;
2983
3221
 
2984
3222
  return tensor;
2985
3223
  }
2986
3224
 
2987
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
2988
- struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
3225
+ const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
3226
+ const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
2989
3227
 
2990
3228
  if (cur == NULL) {
2991
3229
  if (!required) {
@@ -2996,8 +3234,8 @@ struct llama_model_loader {
2996
3234
 
2997
3235
  {
2998
3236
  bool is_ok = true;
2999
- for (size_t i = 0; i < ne.size(); ++i) {
3000
- if (ne[i] != cur->ne[i]) {
3237
+ for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
3238
+ if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
3001
3239
  is_ok = false;
3002
3240
  break;
3003
3241
  }
@@ -3011,127 +3249,196 @@ struct llama_model_loader {
3011
3249
  }
3012
3250
  }
3013
3251
 
3014
- return create_tensor_for(ctx, cur);
3252
+ return cur;
3015
3253
  }
3016
3254
 
3017
- void done_getting_tensors() const {
3018
- if (n_created != n_tensors) {
3019
- throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
3255
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
3256
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
3257
+
3258
+ if (cur == NULL) {
3259
+ return NULL;
3020
3260
  }
3261
+
3262
+ return create_tensor_for(ctx, cur);
3021
3263
  }
3022
3264
 
3023
- size_t file_offset(const char * name) const {
3024
- const int idx = gguf_find_tensor(ctx_gguf, name);
3265
+ struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
3266
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
3025
3267
 
3026
- if (idx < 0) {
3027
- throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name));
3268
+ if (cur == NULL) {
3269
+ return NULL;
3028
3270
  }
3029
3271
 
3030
- return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
3031
- }
3272
+ if (cur->type != base->type) {
3273
+ throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type)));
3274
+ }
3032
3275
 
3033
- void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) {
3034
- // prefetch the whole file - all the data is needed anyway
3035
- if (use_mmap) {
3036
- mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
3276
+ std::array<int64_t, GGML_MAX_DIMS> dims;
3277
+ for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
3278
+ dims[i] = i < ne.size() ? ne[i] : 1;
3037
3279
  }
3038
3280
 
3039
- // compute the total size of all tensors for progress reporting
3040
- for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
3041
- struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
3042
- size_data += ggml_nbytes(cur);
3281
+ struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
3282
+ dims[0], dims[1], dims[2], dims[3],
3283
+ cur->nb[1], cur->nb[2], cur->nb[3],
3284
+ offset);
3285
+
3286
+ ggml_set_name(tensor, name.c_str());
3287
+
3288
+ n_created++;
3289
+
3290
+ return tensor;
3291
+ }
3292
+
3293
+ void done_getting_tensors() const {
3294
+ if (n_created != n_tensors) {
3295
+ throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
3043
3296
  }
3297
+ }
3044
3298
 
3045
- if (use_mmap && mapping) {
3046
- if (lmlock) {
3047
- lmlock->init(mapping->addr);
3299
+ void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) {
3300
+ if (use_mmap) {
3301
+ mappings.reserve(files.size());
3302
+ mmaps_used.reserve(files.size());
3303
+ for (const auto & file : files) {
3304
+ std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
3305
+ mmaps_used.emplace_back(mapping->size, 0);
3306
+ if (mlock_mmaps) {
3307
+ std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
3308
+ mlock_mmap->init(mapping->addr);
3309
+ mlock_mmaps->emplace_back(std::move(mlock_mmap));
3310
+ }
3311
+ mappings.emplace_back(std::move(mapping));
3048
3312
  }
3049
- mmap_used_first = mapping->size;
3313
+ }
3314
+
3315
+ // compute the total size of all tensors for progress reporting
3316
+ for (auto & w : weights) {
3317
+ size_data += ggml_nbytes(w.tensor);
3050
3318
  }
3051
3319
  }
3052
3320
 
3053
- void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
3054
- GGML_ASSERT(mapping);
3321
+ void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
3322
+ GGML_ASSERT(!mappings.empty());
3323
+ const auto & mapping = mappings.at(idx);
3055
3324
 
3056
3325
  *first = mapping->size;
3057
3326
  *last = 0;
3327
+ *addr = mapping->addr;
3058
3328
  for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
3059
- const size_t offs = file_offset(ggml_get_name(tensor));
3060
- *first = std::min(*first, offs);
3061
- *last = std::max(*last, offs + ggml_nbytes(tensor));
3329
+ try {
3330
+ const auto * weight = get_weight(ggml_get_name(tensor));
3331
+ if (!weight) {
3332
+ continue;
3333
+ }
3334
+ if (weight->idx != idx) {
3335
+ continue;
3336
+ }
3337
+ *first = std::min(*first, weight->offs);
3338
+ *last = std::max(*last, weight->offs + ggml_nbytes(tensor));
3339
+ } catch(...) {
3340
+ // the tensor is not in the model
3341
+ }
3062
3342
  }
3063
3343
  }
3064
3344
 
3065
3345
  // for backwards compatibility, does not support ggml-backend
3066
3346
  void load_data_for(struct ggml_tensor * cur) const {
3067
- const size_t offs = file_offset(ggml_get_name(cur));
3347
+ const auto & w = require_weight(ggml_get_name(cur));
3068
3348
 
3069
- if (use_mmap && mapping) {
3349
+ if (use_mmap) {
3350
+ const auto & mapping = mappings.at(w.idx);
3070
3351
  if (cur->data == nullptr) {
3071
- cur->data = (uint8_t *)mapping->addr + offs;
3352
+ cur->data = (uint8_t *)mapping->addr + w.offs;
3072
3353
  } else {
3073
- memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
3354
+ memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur));
3074
3355
  }
3075
3356
  } else {
3076
3357
  GGML_ASSERT(cur->data != nullptr);
3077
- file.seek(offs, SEEK_SET);
3078
- file.read_raw(cur->data, ggml_nbytes(cur));
3358
+ GGML_ASSERT(w.idx < files.size());
3359
+ const auto & file = files.at(w.idx);
3360
+ file->seek(w.offs, SEEK_SET);
3361
+ file->read_raw(cur->data, ggml_nbytes(cur));
3079
3362
  }
3080
3363
  }
3081
3364
 
3082
3365
  size_t size_done = 0;
3083
3366
  size_t size_data = 0;
3084
- size_t mmap_used_first = -1;
3085
- size_t mmap_used_last = 0;
3367
+ std::vector<std::pair<size_t, size_t>> mmaps_used;
3086
3368
 
3087
3369
  // Returns false if cancelled by progress_callback
3088
- bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) {
3089
- GGML_ASSERT(size_data != 0 && "call init_mapping() first");
3370
+ bool load_all_data(
3371
+ struct ggml_context * ctx,
3372
+ llama_buf_map & bufs_mmap,
3373
+ llama_mlocks * lmlocks,
3374
+ llama_progress_callback progress_callback,
3375
+ void * progress_callback_user_data) {
3376
+ GGML_ASSERT(size_data != 0 && "call init_mappings() first");
3090
3377
 
3091
3378
  std::vector<no_init<uint8_t>> read_buf;
3092
-
3093
3379
  for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
3380
+ const auto * weight = get_weight(ggml_get_name(cur));
3381
+ if (weight == nullptr) {
3382
+ // this can happen with split experts models
3383
+ continue;
3384
+ }
3385
+
3094
3386
  if (progress_callback) {
3095
3387
  if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
3096
3388
  return false;
3097
3389
  }
3098
3390
  }
3099
3391
 
3100
- const size_t offs = file_offset(ggml_get_name(cur));
3392
+ size_t n_size = ggml_nbytes(cur);
3101
3393
 
3102
- if (use_mmap && mapping) {
3394
+ if (use_mmap) {
3395
+ const auto & mapping = mappings.at(weight->idx);
3396
+ ggml_backend_buffer_t buf_mmap = nullptr;
3397
+ if (bufs_mmap.count(weight->idx)) {
3398
+ buf_mmap = bufs_mmap.at(weight->idx);
3399
+ }
3400
+ GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
3103
3401
  if (buf_mmap && cur->data == nullptr) {
3104
- ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
3105
- if (lmlock) {
3106
- lmlock->grow_to(offs + ggml_nbytes(cur));
3402
+ ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
3403
+ if (lmlocks) {
3404
+ const auto & lmlock = lmlocks->at(weight->idx);
3405
+ lmlock->grow_to(weight->offs + ggml_nbytes(cur));
3107
3406
  }
3108
- mmap_used_first = std::min(mmap_used_first, offs);
3109
- mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur));
3407
+
3408
+ auto & mmap_used = mmaps_used[weight->idx];
3409
+ mmap_used.first = std::min(mmap_used.first, weight->offs);
3410
+ mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
3110
3411
  } else {
3111
- ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
3412
+ ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
3112
3413
  }
3113
3414
  } else {
3415
+ GGML_ASSERT(weight->idx < files.size());
3416
+ const auto & file = files.at(weight->idx);
3114
3417
  if (ggml_backend_buffer_is_host(cur->buffer)) {
3115
- file.seek(offs, SEEK_SET);
3116
- file.read_raw(cur->data, ggml_nbytes(cur));
3418
+ file->seek(weight->offs, SEEK_SET);
3419
+ file->read_raw(cur->data, ggml_nbytes(cur));
3117
3420
  } else {
3118
3421
  read_buf.resize(ggml_nbytes(cur));
3119
- file.seek(offs, SEEK_SET);
3120
- file.read_raw(read_buf.data(), ggml_nbytes(cur));
3121
- ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
3422
+ file->seek(weight->offs, SEEK_SET);
3423
+ file->read_raw(read_buf.data(), ggml_nbytes(cur));
3424
+ ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
3122
3425
  }
3123
3426
  }
3124
3427
 
3125
- size_done += ggml_nbytes(cur);
3428
+ size_done += n_size;
3126
3429
  }
3127
3430
 
3128
3431
  // check if this is the last call and do final cleanup
3129
3432
  if (size_done >= size_data) {
3130
3433
  // unmap offloaded tensors and metadata
3131
- if (use_mmap && mapping) {
3132
- mapping->unmap_fragment(0, mmap_used_first);
3133
- if (mmap_used_last != 0) {
3134
- mapping->unmap_fragment(mmap_used_last, mapping->size);
3434
+ if (use_mmap) {
3435
+ for (uint32_t idx = 0; idx < mappings.size(); idx++) {
3436
+ const auto & mmap_used = mmaps_used.at(idx);
3437
+ auto & mapping = mappings.at(idx);
3438
+ mapping->unmap_fragment(0, mmap_used.first);
3439
+ if (mmap_used.second != 0) {
3440
+ mapping->unmap_fragment(mmap_used.second, mapping->size);
3441
+ }
3135
3442
  }
3136
3443
  }
3137
3444
  if (progress_callback) {
@@ -3204,6 +3511,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
3204
3511
  case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
3205
3512
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
3206
3513
  case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
3514
+ case LLAMA_FTYPE_MOSTLY_IQ1_M :return "IQ1_M - 1.75 bpw";
3207
3515
  case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
3208
3516
  case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
3209
3517
  case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
@@ -3231,9 +3539,11 @@ static const char * llama_model_type_name(e_model type) {
3231
3539
  case MODEL_20B: return "20B";
3232
3540
  case MODEL_30B: return "30B";
3233
3541
  case MODEL_34B: return "34B";
3542
+ case MODEL_35B: return "35B";
3234
3543
  case MODEL_40B: return "40B";
3235
3544
  case MODEL_65B: return "65B";
3236
3545
  case MODEL_70B: return "70B";
3546
+ case MODEL_314B: return "314B";
3237
3547
  case MODEL_SMALL: return "0.1B";
3238
3548
  case MODEL_MEDIUM: return "0.4B";
3239
3549
  case MODEL_LARGE: return "0.8B";
@@ -3263,7 +3573,7 @@ static void llm_load_hparams(
3263
3573
  llama_model_loader & ml,
3264
3574
  llama_model & model) {
3265
3575
  auto & hparams = model.hparams;
3266
- const gguf_context * ctx = ml.ctx_gguf;
3576
+ const gguf_context * ctx = ml.meta;
3267
3577
 
3268
3578
  // get metadata as string
3269
3579
  for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -3372,6 +3682,15 @@ static void llm_load_hparams(
3372
3682
  default: model.type = e_model::MODEL_UNKNOWN;
3373
3683
  }
3374
3684
  } break;
3685
+ case LLM_ARCH_GROK:
3686
+ {
3687
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
3688
+
3689
+ switch (hparams.n_layer) {
3690
+ case 64: model.type = e_model::MODEL_314B; break;
3691
+ default: model.type = e_model::MODEL_UNKNOWN;
3692
+ }
3693
+ } break;
3375
3694
  case LLM_ARCH_FALCON:
3376
3695
  {
3377
3696
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3623,6 +3942,25 @@ static void llm_load_hparams(
3623
3942
  default: model.type = e_model::MODEL_UNKNOWN;
3624
3943
  }
3625
3944
  } break;
3945
+ case LLM_ARCH_XVERSE:
3946
+ {
3947
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
3948
+ switch (hparams.n_layer) {
3949
+ case 32: model.type = e_model::MODEL_7B; break;
3950
+ case 40: model.type = e_model::MODEL_13B; break;
3951
+ case 80: model.type = e_model::MODEL_65B; break;
3952
+ default: model.type = e_model::MODEL_UNKNOWN;
3953
+ }
3954
+ } break;
3955
+ case LLM_ARCH_COMMAND_R:
3956
+ {
3957
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
3958
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3959
+ switch (hparams.n_layer) {
3960
+ case 40: model.type = e_model::MODEL_35B; break;
3961
+ default: model.type = e_model::MODEL_UNKNOWN;
3962
+ }
3963
+ } break;
3626
3964
  default: (void)0;
3627
3965
  }
3628
3966
 
@@ -3644,7 +3982,7 @@ static void llm_load_vocab(
3644
3982
  llama_model & model) {
3645
3983
  auto & vocab = model.vocab;
3646
3984
 
3647
- struct gguf_context * ctx = ml.ctx_gguf;
3985
+ struct gguf_context * ctx = ml.meta;
3648
3986
 
3649
3987
  const auto kv = LLM_KV(model.arch);
3650
3988
 
@@ -3777,7 +4115,7 @@ static void llm_load_vocab(
3777
4115
  } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
3778
4116
  vocab.linefeed_id = vocab.special_pad_id;
3779
4117
  } else {
3780
- const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
4118
+ const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
3781
4119
  GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
3782
4120
  vocab.linefeed_id = ids[0];
3783
4121
  }
@@ -3944,6 +4282,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
3944
4282
  LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
3945
4283
  LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
3946
4284
  LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
4285
+ LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
3947
4286
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
3948
4287
  LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
3949
4288
  LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
@@ -4009,6 +4348,7 @@ static bool llm_load_tensors(
4009
4348
 
4010
4349
  const int64_t n_layer = hparams.n_layer;
4011
4350
  const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
4351
+ bool use_mmap_buffer = true;
4012
4352
 
4013
4353
  // there is very little benefit to offloading the input layer, so always keep it on the CPU
4014
4354
  model.buft_input = llama_default_buffer_type_cpu(true);
@@ -4097,6 +4437,10 @@ static bool llm_load_tensors(
4097
4437
 
4098
4438
  // create one context per buffer type
4099
4439
  size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
4440
+
4441
+ // for moe merged tensors
4442
+ ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
4443
+
4100
4444
  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
4101
4445
  for (auto & it : buft_layer_count) {
4102
4446
  struct ggml_init_params params = {
@@ -4123,6 +4467,11 @@ static bool llm_load_tensors(
4123
4467
  const int64_t n_vocab = hparams.n_vocab;
4124
4468
  const int64_t n_vocab_type = hparams.n_vocab_type;
4125
4469
  const int64_t n_ff = hparams.n_ff;
4470
+ const int64_t n_expert = hparams.n_expert;
4471
+
4472
+ if (n_expert > 0 && hparams.n_expert_used == 0) {
4473
+ throw std::runtime_error("model has expert layers but no expert layers are used");
4474
+ }
4126
4475
 
4127
4476
  GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
4128
4477
 
@@ -4177,26 +4526,113 @@ static bool llm_load_tensors(
4177
4526
 
4178
4527
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4179
4528
 
4180
- layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
4181
-
4182
- if (layer.ffn_gate_inp == nullptr) {
4183
- GGML_ASSERT(hparams.n_expert == 0);
4184
- GGML_ASSERT(hparams.n_expert_used == 0);
4185
-
4529
+ if (n_expert == 0) {
4186
4530
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
4187
4531
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4188
4532
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4189
4533
  } else {
4190
- GGML_ASSERT(hparams.n_expert > 0);
4191
- GGML_ASSERT(hparams.n_expert_used > 0);
4192
-
4193
- // MoE branch
4194
- for (uint32_t x = 0; x < hparams.n_expert; ++x) {
4195
- layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff});
4196
- layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd});
4197
- layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff});
4534
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
4535
+
4536
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
4537
+ if (layer.ffn_gate_exps) {
4538
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
4539
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
4540
+ } else {
4541
+ // merge split expert into a single tensor for compatibility with older models
4542
+ // requires disabling mmap
4543
+ use_mmap_buffer = false;
4544
+
4545
+ ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
4546
+ ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
4547
+ ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
4548
+
4549
+ layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
4550
+ layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
4551
+ layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
4552
+
4553
+ ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
4554
+ ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
4555
+ ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
4556
+
4557
+ for (uint32_t x = 0; x < n_expert; ++x) {
4558
+ // the individual experts are loaded into a view of the merged tensor
4559
+ ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
4560
+ ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
4561
+ ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
4562
+ }
4563
+ }
4564
+ }
4565
+ }
4566
+ } break;
4567
+ case LLM_ARCH_GROK:
4568
+ {
4569
+ if (n_expert == 0) {
4570
+ throw std::runtime_error("Grok model cannot have zero experts");
4571
+ }
4572
+
4573
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4574
+
4575
+ // output
4576
+ {
4577
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4578
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
4579
+ // if output is NULL, init from the input tok embed
4580
+ if (model.output == NULL) {
4581
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4582
+ ml.n_created--; // artificial tensor
4583
+ ml.size_data += ggml_nbytes(model.output);
4584
+ }
4585
+ }
4586
+
4587
+ for (int i = 0; i < n_layer; ++i) {
4588
+ ggml_context * ctx_layer = ctx_for_layer(i);
4589
+ ggml_context * ctx_split = ctx_for_layer_split(i);
4590
+
4591
+ auto & layer = model.layers[i];
4592
+
4593
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4594
+
4595
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
4596
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
4597
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
4598
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
4599
+
4600
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
4601
+
4602
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4603
+
4604
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
4605
+
4606
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
4607
+ if (layer.ffn_gate_exps) {
4608
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
4609
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
4610
+ } else {
4611
+ // merge split expert into a single tensor for compatibility with older models
4612
+ // requires disabling mmap
4613
+ use_mmap_buffer = false;
4614
+
4615
+ ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
4616
+ ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
4617
+ ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
4618
+
4619
+ layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
4620
+ layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
4621
+ layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
4622
+
4623
+ ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
4624
+ ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
4625
+ ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
4626
+
4627
+ for (uint32_t x = 0; x < n_expert; ++x) {
4628
+ // the individual experts are loaded into a view of the merged tensor
4629
+ ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
4630
+ ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
4631
+ ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
4198
4632
  }
4199
4633
  }
4634
+
4635
+ layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
4200
4636
  }
4201
4637
  } break;
4202
4638
  case LLM_ARCH_BAICHUAN:
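In the MoE loading path above, when the per-expert tensors are stored separately, each expert's 2-D matrix is loaded into a view of one merged 3-D tensor, offset by the third-dimension stride nb[2]. A minimal standalone sketch of that offset arithmetic for a contiguous F32 tensor, with made-up sizes rather than values read from a real model:

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
    // hypothetical shapes, standing in for {n_embd, n_ff, n_expert}
    const int64_t n_embd   = 4;
    const int64_t n_ff     = 6;
    const int64_t n_expert = 8;

    // byte strides of a contiguous float32 tensor, mirroring ggml's nb[]
    const size_t nb0 = sizeof(float);   // step between elements of a row
    const size_t nb1 = nb0 * n_embd;    // step between rows
    const size_t nb2 = nb1 * n_ff;      // step between experts (one 2-D slice)

    // expert x lives at byte offset nb2*x, the offset passed to create_tensor_as_view
    for (int64_t x = 0; x < n_expert; ++x) {
        printf("expert %2lld -> offset %zu bytes\n", (long long) x, nb2 * x);
    }
    return 0;
}

The merged tensor is a fresh allocation with experts scattered across the file, so it cannot alias the file mapping the way individual tensors can, which is presumably why this path forces use_mmap_buffer = false.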
@@ -4235,9 +4671,9 @@ static bool llm_load_tensors(
4235
4671
  {
4236
4672
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4237
4673
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
4238
- if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
4239
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
4240
- } else {
4674
+
4675
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
4676
+ if (!model.output) {
4241
4677
  model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
4242
4678
  ml.n_created--; // artificial tensor
4243
4679
  ml.size_data += ggml_nbytes(model.output);
@@ -4253,10 +4689,8 @@ static bool llm_load_tensors(
4253
4689
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4254
4690
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
4255
4691
 
4256
- if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
4257
- layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
4258
- layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
4259
- }
4692
+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
4693
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);
4260
4694
 
4261
4695
  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
4262
4696
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
@@ -4436,16 +4870,19 @@ static bool llm_load_tensors(
4436
4870
  case LLM_ARCH_MPT:
4437
4871
  {
4438
4872
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4873
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);
4439
4874
 
4440
4875
  // output
4441
4876
  {
4442
4877
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4443
4878
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
4444
4879
 
4445
- // same as tok_embd, duplicated to allow offloading
4446
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4447
- ml.n_created--; // artificial tensor
4448
- ml.size_data += ggml_nbytes(model.output);
4880
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
4881
+ if (!model.output) {
4882
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
4883
+ ml.n_created--; // artificial tensor
4884
+ ml.size_data += ggml_nbytes(model.output);
4885
+ }
4449
4886
  }
4450
4887
 
4451
4888
  for (int i = 0; i < n_layer; ++i) {
@@ -4472,6 +4909,12 @@ static bool llm_load_tensors(
4472
4909
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4473
4910
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
4474
4911
 
4912
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
4913
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
4914
+
4915
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
4916
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
4917
+
4475
4918
  // AWQ ScaleActivation layer
4476
4919
  layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
4477
4920
  }
@@ -4918,6 +5361,59 @@ static bool llm_load_tensors(
4918
5361
  layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
4919
5362
  }
4920
5363
  } break;
5364
+ case LLM_ARCH_XVERSE:
5365
+ {
5366
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5367
+ {
5368
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5369
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
5370
+ }
5371
+ for (int i = 0; i < n_layer; ++i) {
5372
+ ggml_context * ctx_layer = ctx_for_layer(i);
5373
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5374
+ auto & layer = model.layers[i];
5375
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5376
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5377
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5378
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5379
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5380
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
5381
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5382
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
5383
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5384
+ }
5385
+ } break;
5386
+ case LLM_ARCH_COMMAND_R:
5387
+ {
5388
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5389
+
5390
+ // output
5391
+ {
5392
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5393
+ // init output from the input tok embed
5394
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5395
+ ml.n_created--; // artificial tensor
5396
+ ml.size_data += ggml_nbytes(model.output);
5397
+ }
5398
+
5399
+ for (int i = 0; i < n_layer; ++i) {
5400
+ ggml_context * ctx_layer = ctx_for_layer(i);
5401
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5402
+
5403
+ auto & layer = model.layers[i];
5404
+
5405
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5406
+
5407
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5408
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5409
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5410
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5411
+
5412
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5413
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
5414
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5415
+ }
5416
+ } break;
4921
5417
  default:
4922
5418
  throw std::runtime_error("unknown architecture");
4923
5419
  }
@@ -4925,49 +5421,97 @@ static bool llm_load_tensors(
4925
5421
 
4926
5422
  ml.done_getting_tensors();
4927
5423
 
4928
- ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr);
5424
+ ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
5425
+ model.mappings.reserve(ml.mappings.size());
4929
5426
 
4930
5427
  // create the backend buffers
4931
- std::vector<std::pair<ggml_context *, ggml_backend_buffer_t>> ctx_bufs;
5428
+ std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
5429
+ ctx_bufs.reserve(ctx_map.size());
5430
+
5431
+ // Ensure we have enough capacity for the maximum number of backend buffers we may potentially create
5432
+ size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
5433
+ model.bufs.reserve(n_max_backend_buffer);
4932
5434
 
4933
5435
  for (auto & it : ctx_map) {
4934
5436
  ggml_backend_buffer_type_t buft = it.first;
4935
- ggml_context * ctx = it.second;
4936
- ggml_backend_buffer_t buf = nullptr;
5437
+ ggml_context * ctx = it.second;
5438
+
5439
+ llama_buf_map bufs;
5440
+ bufs.reserve(n_max_backend_buffer);
4937
5441
 
4938
5442
  // only the mmap region containing the tensors in the model is mapped to the backend buffer
4939
5443
  // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
4940
5444
  // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
4941
- if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
4942
- size_t first, last;
4943
- ml.get_mapping_range(&first, &last, ctx);
4944
- buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
5445
+ if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
5446
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
5447
+ void * addr = nullptr;
5448
+ size_t first, last;
5449
+ ml.get_mapping_range(&first, &last, &addr, idx, ctx);
5450
+ if (first >= last) {
5451
+ continue;
5452
+ }
5453
+ ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
5454
+ if (buf == nullptr) {
5455
+ throw std::runtime_error("unable to allocate backend CPU buffer");
5456
+ }
5457
+ model.bufs.push_back(buf);
5458
+ bufs.emplace(idx, buf);
5459
+ #ifdef GGML_USE_CUDA
5460
+ if (n_layer >= n_gpu_layers) {
5461
+ ggml_backend_cuda_register_host_buffer(
5462
+ ggml_backend_buffer_get_base(buf),
5463
+ ggml_backend_buffer_get_size(buf));
5464
+ }
5465
+ #endif
5466
+ }
4945
5467
  }
4946
5468
  #ifdef GGML_USE_METAL
4947
- else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
4948
- const size_t max_size = ggml_get_max_tensor_size(ctx);
4949
- size_t first, last;
4950
- ml.get_mapping_range(&first, &last, ctx);
4951
- buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size);
5469
+ else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
5470
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
5471
+ const size_t max_size = ggml_get_max_tensor_size(ctx);
5472
+ void * addr = nullptr;
5473
+ size_t first, last;
5474
+ ml.get_mapping_range(&first, &last, &addr, idx, ctx);
5475
+ if (first >= last) {
5476
+ continue;
5477
+ }
5478
+ ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
5479
+ if (buf == nullptr) {
5480
+ throw std::runtime_error("unable to allocate backend metal buffer");
5481
+ }
5482
+ model.bufs.push_back(buf);
5483
+ bufs.emplace(idx, buf);
5484
+ }
4952
5485
  }
4953
5486
  #endif
4954
5487
  else {
4955
- buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
4956
- if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
5488
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
5489
+ if (buf == nullptr) {
5490
+ throw std::runtime_error("unable to allocate backend buffer");
5491
+ }
5492
+ model.bufs.push_back(buf);
5493
+ if (use_mlock && ggml_backend_buffer_is_host(buf)) {
4957
5494
  model.mlock_bufs.emplace_back(new llama_mlock);
4958
5495
  auto & mlock_buf = model.mlock_bufs.back();
4959
5496
  mlock_buf->init (ggml_backend_buffer_get_base(buf));
4960
5497
  mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
4961
5498
  }
5499
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
5500
+ bufs.emplace(idx, buf);
5501
+ }
4962
5502
  }
4963
- if (buf == nullptr) {
5503
+
5504
+ if (bufs.empty()) {
4964
5505
  throw std::runtime_error("failed to allocate buffer");
4965
5506
  }
4966
- // indicate that this buffer contains weights
4967
- // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
4968
- ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
4969
- model.bufs.push_back(buf);
4970
- ctx_bufs.emplace_back(ctx, buf);
5507
+
5508
+ for (auto & buf : bufs) {
5509
+ // indicate that this buffer contains weights
5510
+ // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
5511
+ ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
5512
+ }
5513
+
5514
+ ctx_bufs.emplace_back(ctx, bufs);
4971
5515
  }
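The loop above wraps, per file, only the byte range of the memory mapping that holds tensors for the given context (get_mapping_range returns that range together with the mapping's base address). A hedged, self-contained sketch of how such a [first, last) range could be derived from tensor offsets; the helper and the offsets here are invented for illustration, not taken from the loader:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

struct tensor_span { size_t offset, size; }; // position of a tensor inside one mapped file

// conceptual counterpart of get_mapping_range: the smallest byte range of the
// mapping that covers every tensor this context owns (an assumption, not the
// loader's exact code)
static void mapping_range(const std::vector<tensor_span> & spans, size_t & first, size_t & last) {
    first = SIZE_MAX;
    last  = 0;
    for (const auto & s : spans) {
        first = std::min(first, s.offset);
        last  = std::max(last,  s.offset + s.size);
    }
    if (first > last) { first = last = 0; } // no tensors from this file in the context
}

int main() {
    std::vector<tensor_span> spans = { {4096, 1024}, {16384, 2048}, {8192, 512} }; // made-up offsets
    size_t first, last;
    mapping_range(spans, first, last);
    // only [first, last) of the mmap would be wrapped in a backend buffer
    printf("map bytes [%zu, %zu) -> %zu bytes instead of the whole file\n", first, last, last - first);
    return 0;
}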
4972
5516
 
4973
5517
  if (llama_supports_gpu_offload()) {
@@ -4999,13 +5543,17 @@ static bool llm_load_tensors(
4999
5543
  // load tensor data
5000
5544
  for (auto & it : ctx_bufs) {
5001
5545
  ggml_context * ctx = it.first;
5002
- ggml_backend_buffer_t buf = it.second;
5003
- if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) {
5546
+ auto & bufs = it.second;
5547
+ if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
5004
5548
  return false;
5005
5549
  }
5006
5550
  }
5007
5551
 
5008
- model.mapping = std::move(ml.mapping);
5552
+ if (use_mmap_buffer) {
5553
+ for (auto & mapping : ml.mappings) {
5554
+ model.mappings.emplace_back(std::move(mapping));
5555
+ }
5556
+ }
5009
5557
 
5010
5558
  // loading time will be recalculate after the first eval, so
5011
5559
  // we take page faults deferred by mmap() into consideration
@@ -5064,6 +5612,16 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
5064
5612
  }
5065
5613
  #endif
5066
5614
 
5615
+ #ifdef GGML_USE_SYCL
5616
+ if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
5617
+ ggml_backend_sycl_set_single_device_mode(params.main_gpu);
5618
+ // SYCL uses the device index (0, 1, 2) directly; convert the user-provided device id to a device index.
5619
+ params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu);
5620
+ } else {
5621
+ ggml_backend_sycl_set_mul_device_mode();
5622
+ }
5623
+ #endif
5624
+
5067
5625
  if (!llm_load_tensors(
5068
5626
  ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
5069
5627
  params.progress_callback, params.progress_callback_user_data
@@ -5150,8 +5708,8 @@ static void llm_build_kv_store(
5150
5708
  GGML_ASSERT(kv.size == n_ctx);
5151
5709
 
5152
5710
  // compute the transposed [n_tokens, n_embd] V matrix
5153
- struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
5154
- //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
5711
+ assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
5712
+ struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
5155
5713
  cb(v_cur_t, "v_cur_t", il);
5156
5714
 
5157
5715
  struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
@@ -5335,6 +5893,20 @@ static struct ggml_tensor * llm_build_kqv(
5335
5893
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
5336
5894
  }
5337
5895
 
5896
+ if (model.arch == LLM_ARCH_GROK) {
5897
+ // need to do the following:
5898
+ // multiply by attn_output_multiplier of 0.08838834764831845
5899
+ // and then:
5900
+ // kq = 30 * tanh(kq / 30)
5901
+ // before the softmax below
5902
+
5903
+ //try from phi2
5904
+ //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
5905
+
5906
+ kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
5907
+ kq = ggml_scale(ctx, kq, 30);
5908
+ }
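The two calls above fold the attention-output multiplier into the soft cap in one pass: scaling by m/30 inside the tanh and by 30 outside is algebraically identical to the two steps spelled out in the comment. As a worked equation (m is the constant from the comment):

\mathrm{kq}' \;=\; 30\,\tanh\!\left(\frac{m\,\mathrm{kq}}{30}\right), \qquad m = 0.08838834764831845 .

Since |tanh| < 1, the capped scores stay strictly inside (-30, 30) before the softmax, which appears to be the point of the cap.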
5909
+
5338
5910
  #if defined(GGML_USE_KOMPUTE)
5339
5911
  #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
5340
5912
  #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
@@ -5461,7 +6033,8 @@ struct llm_build_context {
5461
6033
  const float norm_rms_eps;
5462
6034
 
5463
6035
  const int32_t n_tokens;
5464
- const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx)
6036
+ const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
6037
+ const int32_t n_outputs;
5465
6038
  const int32_t kv_head; // index of where we store new KV data in the cache
5466
6039
  const int32_t n_orig_ctx;
5467
6040
 
@@ -5508,6 +6081,7 @@ struct llm_build_context {
5508
6081
  norm_rms_eps (hparams.f_norm_rms_eps),
5509
6082
  n_tokens (batch.n_tokens),
5510
6083
  n_kv (worst_case ? kv_self.size : kv_self.n),
6084
+ n_outputs (worst_case ? n_tokens : lctx.n_outputs),
5511
6085
  kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
5512
6086
  n_orig_ctx (cparams.n_yarn_orig_ctx),
5513
6087
  pooling_type (cparams.pooling_type),
@@ -5529,6 +6103,7 @@ struct llm_build_context {
5529
6103
  lctx.inp_tokens = nullptr;
5530
6104
  lctx.inp_embd = nullptr;
5531
6105
  lctx.inp_pos = nullptr;
6106
+ lctx.inp_out_ids = nullptr;
5532
6107
  lctx.inp_KQ_mask = nullptr;
5533
6108
  lctx.inp_KQ_pos = nullptr;
5534
6109
  lctx.inp_K_shift = nullptr;
@@ -5652,6 +6227,13 @@ struct llm_build_context {
5652
6227
  return lctx.inp_pos;
5653
6228
  }
5654
6229
 
6230
+ struct ggml_tensor * build_inp_out_ids() {
6231
+ lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
6232
+ cb(lctx.inp_out_ids, "inp_out_ids", -1);
6233
+ ggml_set_input(lctx.inp_out_ids);
6234
+ return lctx.inp_out_ids;
6235
+ }
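build_inp_out_ids allocates a 1-D I32 tensor holding the indices of the tokens whose outputs are actually wanted; the per-architecture graphs then call ggml_get_rows with it on the last layer, so the final matmuls only see those rows. A standalone sketch of that gather on a plain row-major buffer, with invented sizes:

#include <cstddef>
#include <cstdio>
#include <vector>

// gather selected rows of an [n_rows x n_cols] row-major matrix, the way
// ggml_get_rows(cur, inp_out_ids) shrinks `cur` to the requested outputs
static std::vector<float> get_rows(const std::vector<float> & src, int n_cols,
                                   const std::vector<int> & ids) {
    std::vector<float> dst(ids.size() * n_cols);
    for (size_t r = 0; r < ids.size(); ++r) {
        for (int c = 0; c < n_cols; ++c) {
            dst[r * n_cols + c] = src[(size_t) ids[r] * n_cols + c];
        }
    }
    return dst;
}

int main() {
    const int n_cols = 4;                   // stand-in for n_embd
    std::vector<float> hidden(5 * n_cols);  // 5 tokens in the batch
    for (size_t i = 0; i < hidden.size(); ++i) hidden[i] = (float) i;

    std::vector<int> out_ids = {2, 4};      // only these tokens need logits
    std::vector<float> trimmed = get_rows(hidden, n_cols, out_ids);

    printf("kept %zu of 5 rows for the lm_head\n", trimmed.size() / n_cols);
    return 0;
}

For large prompts where only the last logit row is needed, this shrinks the lm_head matmul from n_tokens rows down to n_outputs rows.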
6236
+
5655
6237
  struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
5656
6238
  if (causal) {
5657
6239
  lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
@@ -5708,6 +6290,9 @@ struct llm_build_context {
5708
6290
  struct ggml_cgraph * build_llama() {
5709
6291
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5710
6292
 
6293
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
6294
+ int32_t n_tokens = this->n_tokens;
6295
+
5711
6296
  const int64_t n_embd_head = hparams.n_embd_head_v;
5712
6297
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5713
6298
  GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -5775,6 +6360,14 @@ struct llm_build_context {
5775
6360
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5776
6361
  }
5777
6362
 
6363
+ if (il == n_layer - 1) {
6364
+ // skip computing output for unused tokens
6365
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6366
+ n_tokens = n_outputs;
6367
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6368
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6369
+ }
6370
+
5778
6371
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
5779
6372
  cb(ffn_inp, "ffn_inp", il);
5780
6373
 
@@ -5827,19 +6420,19 @@ struct llm_build_context {
5827
6420
  for (int i = 0; i < n_expert_used; ++i) {
5828
6421
  ggml_tensor * cur_expert;
5829
6422
 
5830
- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
6423
+ ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
5831
6424
  cb(cur_up, "ffn_moe_up", il);
5832
6425
 
5833
- ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
6426
+ ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
5834
6427
  cb(cur_gate, "ffn_moe_gate", il);
5835
6428
 
5836
6429
  cur_gate = ggml_silu(ctx0, cur_gate);
5837
6430
  cb(cur_gate, "ffn_moe_silu", il);
5838
6431
 
5839
- cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
6432
+ cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
5840
6433
  cb(cur_expert, "ffn_moe_gate_par", il);
5841
6434
 
5842
- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
6435
+ cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
5843
6436
  cb(cur_expert, "ffn_moe_down", il);
5844
6437
 
5845
6438
  cur_expert = ggml_mul(ctx0, cur_expert,
@@ -5858,6 +6451,12 @@ struct llm_build_context {
5858
6451
  }
5859
6452
 
5860
6453
  cur = ggml_add(ctx0, cur, ffn_inp);
6454
+ cb(cur, "ffn_out", il);
6455
+
6456
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
6457
+ if (layer_dir != nullptr) {
6458
+ cur = ggml_add(ctx0, cur, layer_dir);
6459
+ }
5861
6460
  cb(cur, "l_out", il);
5862
6461
 
5863
6462
  // input for next layer
@@ -5893,7 +6492,7 @@ struct llm_build_context {
5893
6492
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
5894
6493
 
5895
6494
  // inp_pos - contains the positions
5896
- struct ggml_tensor * inp_pos = build_inp_pos();
6495
+ struct ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr;
5897
6496
 
5898
6497
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5899
6498
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
@@ -5943,12 +6542,18 @@ struct llm_build_context {
5943
6542
  cb(Qcur, "Qcur", il);
5944
6543
  cb(Kcur, "Kcur", il);
5945
6544
 
5946
-
5947
6545
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5948
6546
  model.layers[il].wo, NULL,
5949
6547
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5950
6548
  }
5951
6549
 
6550
+ if (il == n_layer - 1) {
6551
+ // skip computing output for unused tokens
6552
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6553
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6554
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6555
+ }
6556
+
5952
6557
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
5953
6558
  cb(ffn_inp, "ffn_inp", il);
5954
6559
 
@@ -5991,6 +6596,111 @@ struct llm_build_context {
5991
6596
  return gf;
5992
6597
  }
5993
6598
 
6599
+ struct ggml_cgraph * build_xverse() {
6600
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6601
+
6602
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6603
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6604
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6605
+
6606
+ struct ggml_tensor * cur;
6607
+ struct ggml_tensor * inpL;
6608
+
6609
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6610
+
6611
+ // inp_pos - contains the positions
6612
+ struct ggml_tensor * inp_pos = build_inp_pos();
6613
+
6614
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6615
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6616
+
6617
+ // positions of the tokens in the KV cache
6618
+ struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
6619
+
6620
+ for (int il = 0; il < n_layer; ++il) {
6621
+ struct ggml_tensor * inpSA = inpL;
6622
+
6623
+ cur = llm_build_norm(ctx0, inpL, hparams,
6624
+ model.layers[il].attn_norm, NULL,
6625
+ LLM_NORM_RMS, cb, il);
6626
+ cb(cur, "attn_norm", il);
6627
+
6628
+ // self-attention
6629
+ {
6630
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6631
+ cb(Qcur, "Qcur", il);
6632
+
6633
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
6634
+ cb(Kcur, "Kcur", il);
6635
+
6636
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
6637
+ cb(Vcur, "Vcur", il);
6638
+
6639
+ Qcur = ggml_rope_custom(
6640
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6641
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6642
+ ext_factor, attn_factor, beta_fast, beta_slow
6643
+ );
6644
+ cb(Qcur, "Qcur", il);
6645
+
6646
+ Kcur = ggml_rope_custom(
6647
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6648
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6649
+ ext_factor, attn_factor, beta_fast, beta_slow
6650
+ );
6651
+ cb(Kcur, "Kcur", il);
6652
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6653
+ model.layers[il].wo, NULL,
6654
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6655
+ }
6656
+
6657
+ if (il == n_layer - 1) {
6658
+ // skip computing output for unused tokens
6659
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6660
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6661
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6662
+ }
6663
+
6664
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6665
+ cb(ffn_inp, "ffn_inp", il);
6666
+
6667
+ // feed-forward network
6668
+ {
6669
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6670
+ model.layers[il].ffn_norm, NULL,
6671
+ LLM_NORM_RMS, cb, il);
6672
+ cb(cur, "ffn_norm", il);
6673
+
6674
+ cur = llm_build_ffn(ctx0, cur,
6675
+ model.layers[il].ffn_up, NULL,
6676
+ model.layers[il].ffn_gate, NULL,
6677
+ model.layers[il].ffn_down, NULL,
6678
+ NULL,
6679
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
6680
+ cb(cur, "ffn_out", il);
6681
+ }
6682
+
6683
+ cur = ggml_add(ctx0, cur, ffn_inp);
6684
+ cb(cur, "l_out", il);
6685
+
6686
+ // input for next layer
6687
+ inpL = cur;
6688
+ }
6689
+
6690
+ cur = inpL;
6691
+
6692
+ cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
6693
+ cb(cur, "result_norm", -1);
6694
+
6695
+ // lm_head
6696
+ cur = ggml_mul_mat(ctx0, model.output, cur);
6697
+ cb(cur, "result_output", -1);
6698
+
6699
+ ggml_build_forward_expand(gf, cur);
6700
+
6701
+ return gf;
6702
+ }
6703
+
5994
6704
  struct ggml_cgraph * build_falcon() {
5995
6705
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5996
6706
 
@@ -6064,6 +6774,14 @@ struct llm_build_context {
6064
6774
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6065
6775
  }
6066
6776
 
6777
+ if (il == n_layer - 1) {
6778
+ // skip computing output for unused tokens
6779
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6780
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6781
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6782
+ attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
6783
+ }
6784
+
6067
6785
  struct ggml_tensor * ffn_inp = cur;
6068
6786
 
6069
6787
  // feed forward
@@ -6104,6 +6822,214 @@ struct llm_build_context {
6104
6822
  return gf;
6105
6823
  }
6106
6824
 
6825
+ struct ggml_cgraph * build_grok() {
6826
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6827
+
6828
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
6829
+ int32_t n_tokens = this->n_tokens;
6830
+
6831
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6832
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6833
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6834
+
6835
+ struct ggml_tensor * cur;
6836
+ struct ggml_tensor * inpL;
6837
+
6838
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6839
+
6840
+ // multiply by embedding_multiplier_scale of 78.38367176906169
6841
+ inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
6842
+
6843
+ // inp_pos - contains the positions
6844
+ struct ggml_tensor * inp_pos = build_inp_pos();
6845
+
6846
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6847
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6848
+
6849
+ for (int il = 0; il < n_layer; ++il) {
6850
+ struct ggml_tensor * inpSA = inpL;
6851
+
6852
+ // norm
6853
+ cur = llm_build_norm(ctx0, inpL, hparams,
6854
+ model.layers[il].attn_norm, NULL,
6855
+ LLM_NORM_RMS, cb, il);
6856
+ cb(cur, "attn_norm", il);
6857
+
6858
+
6859
+ // self-attention
6860
+ {
6861
+ // compute Q and K and RoPE them
6862
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6863
+ cb(Qcur, "Qcur", il);
6864
+ if (model.layers[il].bq) {
6865
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
6866
+ cb(Qcur, "Qcur", il);
6867
+ }
6868
+
6869
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
6870
+ cb(Kcur, "Kcur", il);
6871
+ if (model.layers[il].bk) {
6872
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
6873
+ cb(Kcur, "Kcur", il);
6874
+ }
6875
+
6876
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
6877
+ cb(Vcur, "Vcur", il);
6878
+ if (model.layers[il].bv) {
6879
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
6880
+ cb(Vcur, "Vcur", il);
6881
+ }
6882
+
6883
+ Qcur = ggml_rope_custom(
6884
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6885
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6886
+ ext_factor, attn_factor, beta_fast, beta_slow
6887
+ );
6888
+ cb(Qcur, "Qcur", il);
6889
+
6890
+ Kcur = ggml_rope_custom(
6891
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6892
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6893
+ ext_factor, attn_factor, beta_fast, beta_slow
6894
+ );
6895
+ cb(Kcur, "Kcur", il);
6896
+
6897
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6898
+ model.layers[il].wo, model.layers[il].bo,
6899
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
6900
+ }
6901
+
6902
+ if (il == n_layer - 1) {
6903
+ // skip computing output for unused tokens
6904
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6905
+ n_tokens = n_outputs;
6906
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6907
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6908
+ }
6909
+
6910
+ // Grok
6911
+ // if attn_out_norm is present then apply it before adding the input
6912
+ if (model.layers[il].attn_out_norm) {
6913
+ cur = llm_build_norm(ctx0, cur, hparams,
6914
+ model.layers[il].attn_out_norm, NULL,
6915
+ LLM_NORM_RMS, cb, il);
6916
+ cb(cur, "attn_out_norm", il);
6917
+ }
6918
+
6919
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6920
+ cb(ffn_inp, "ffn_inp", il);
6921
+
6922
+ // feed-forward network
6923
+ // MoE branch
6924
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6925
+ model.layers[il].ffn_norm, NULL,
6926
+ LLM_NORM_RMS, cb, il);
6927
+ cb(cur, "ffn_norm", il);
6928
+
6929
+ ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
6930
+ cb(logits, "ffn_moe_logits", il);
6931
+
6932
+ ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
6933
+ cb(probs, "ffn_moe_probs", il);
6934
+
6935
+ // select experts
6936
+ ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
6937
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
6938
+
6939
+ ggml_tensor * weights = ggml_get_rows(ctx0,
6940
+ ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
6941
+ cb(weights, "ffn_moe_weights", il);
6942
+
6943
+ weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
6944
+
6945
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
6946
+ cb(weights_sum, "ffn_moe_weights_sum", il);
6947
+
6948
+ weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
6949
+ cb(weights, "ffn_moe_weights_norm", il);
6950
+
6951
+ // compute expert outputs
6952
+ ggml_tensor * moe_out = nullptr;
6953
+
6954
+ for (int i = 0; i < n_expert_used; ++i) {
6955
+ ggml_tensor * cur_expert;
6956
+
6957
+ ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
6958
+ cb(cur_up, "ffn_moe_up", il);
6959
+
6960
+ ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
6961
+ cb(cur_gate, "ffn_moe_gate", il);
6962
+
6963
+ //GeLU
6964
+ cur_gate = ggml_gelu(ctx0, cur_gate);
6965
+ cb(cur_gate, "ffn_moe_gelu", il);
6966
+
6967
+ cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
6968
+ cb(cur_expert, "ffn_moe_gate_par", il);
6969
+
6970
+ cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
6971
+ cb(cur_expert, "ffn_moe_down", il);
6972
+
6973
+ cur_expert = ggml_mul(ctx0, cur_expert,
6974
+ ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
6975
+ cb(cur_expert, "ffn_moe_weighted", il);
6976
+
6977
+ if (i == 0) {
6978
+ moe_out = cur_expert;
6979
+ } else {
6980
+ moe_out = ggml_add(ctx0, moe_out, cur_expert);
6981
+ cb(moe_out, "ffn_moe_out", il);
6982
+ }
6983
+ }
6984
+
6985
+ cur = moe_out;
6986
+
6987
+ // Grok
6988
+ // if layer_out_norm is present then apply it before adding the input
6989
+ // Idea: maybe ffn_out_norm is a better name
6990
+ if (model.layers[il].layer_out_norm) {
6991
+ cur = llm_build_norm(ctx0, cur, hparams,
6992
+ model.layers[il].layer_out_norm, NULL,
6993
+ LLM_NORM_RMS, cb, il);
6994
+ cb(cur, "layer_out_norm", il);
6995
+ }
6996
+
6997
+
6998
+ cur = ggml_add(ctx0, cur, ffn_inp);
6999
+ cb(cur, "ffn_out", il);
7000
+
7001
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
7002
+ if (layer_dir != nullptr) {
7003
+ cur = ggml_add(ctx0, cur, layer_dir);
7004
+ }
7005
+ cb(cur, "l_out", il);
7006
+
7007
+ // input for next layer
7008
+ inpL = cur;
7009
+ }
7010
+
7011
+ cur = inpL;
7012
+
7013
+ cur = llm_build_norm(ctx0, cur, hparams,
7014
+ model.output_norm, NULL,
7015
+ LLM_NORM_RMS, cb, -1);
7016
+ cb(cur, "result_norm", -1);
7017
+
7018
+ // lm_head
7019
+ cur = ggml_mul_mat(ctx0, model.output, cur);
7020
+
7021
+ // Grok
7022
+ // multiply logits by output_multiplier_scale of 0.5773502691896257
7023
+
7024
+ cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
7025
+
7026
+ cb(cur, "result_output", -1);
7027
+
7028
+ ggml_build_forward_expand(gf, cur);
7029
+
7030
+ return gf;
7031
+ }
7032
+
6107
7033
  struct ggml_cgraph * build_starcoder() {
6108
7034
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6109
7035
 
@@ -6158,6 +7084,13 @@ struct llm_build_context {
6158
7084
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6159
7085
  }
6160
7086
 
7087
+ if (il == n_layer - 1) {
7088
+ // skip computing output for unused tokens
7089
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7090
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7091
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7092
+ }
7093
+
6161
7094
  // add the input
6162
7095
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
6163
7096
  cb(ffn_inp, "ffn_inp", il);
@@ -6355,6 +7288,13 @@ struct llm_build_context {
6355
7288
  Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6356
7289
  }
6357
7290
 
7291
+ if (il == n_layer - 1) {
7292
+ // skip computing output for unused tokens
7293
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7294
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7295
+ residual = ggml_get_rows(ctx0, residual, inp_out_ids);
7296
+ }
7297
+
6358
7298
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
6359
7299
  cb(ffn_inp, "ffn_inp", il);
6360
7300
 
@@ -6444,6 +7384,13 @@ struct llm_build_context {
6444
7384
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6445
7385
  }
6446
7386
 
7387
+ if (il == n_layer - 1) {
7388
+ // skip computing output for unused tokens
7389
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7390
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7391
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7392
+ }
7393
+
6447
7394
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6448
7395
  cb(ffn_inp, "ffn_inp", il);
6449
7396
 
@@ -6601,6 +7548,13 @@ struct llm_build_context {
6601
7548
  }
6602
7549
  cb(cur, "kqv_out", il);
6603
7550
 
7551
+ if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
7552
+ // skip computing output for unused tokens
7553
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7554
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7555
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7556
+ }
7557
+
6604
7558
  // re-add the layer input
6605
7559
  cur = ggml_add(ctx0, cur, inpL);
6606
7560
 
@@ -6723,6 +7677,13 @@ struct llm_build_context {
6723
7677
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6724
7678
  }
6725
7679
 
7680
+ if (il == n_layer - 1) {
7681
+ // skip computing output for unused tokens
7682
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7683
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7684
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7685
+ }
7686
+
6726
7687
  // Add the input
6727
7688
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
6728
7689
  cb(ffn_inp, "ffn_inp", il);
@@ -6770,6 +7731,7 @@ struct llm_build_context {
6770
7731
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6771
7732
 
6772
7733
  struct ggml_tensor * cur;
7734
+ struct ggml_tensor * pos;
6773
7735
  struct ggml_tensor * inpL;
6774
7736
 
6775
7737
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -6780,6 +7742,16 @@ struct llm_build_context {
6780
7742
  // positions of the tokens in the KV cache
6781
7743
  struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
6782
7744
 
7745
+ if (model.pos_embd) {
7746
+ // inp_pos - contains the positions
7747
+ struct ggml_tensor * inp_pos = build_inp_pos();
7748
+ pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
7749
+ cb(pos, "pos_embd", -1);
7750
+
7751
+ inpL = ggml_add(ctx0, inpL, pos);
7752
+ cb(inpL, "inpL", -1);
7753
+ }
7754
+
6783
7755
  for (int il = 0; il < n_layer; ++il) {
6784
7756
  struct ggml_tensor * attn_norm;
6785
7757
 
@@ -6814,11 +7786,39 @@ struct llm_build_context {
6814
7786
  cb(Kcur, "Kcur", il);
6815
7787
  cb(Vcur, "Vcur", il);
6816
7788
 
6817
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7789
+ // Q/K Layernorm
7790
+ if (model.layers[il].attn_q_norm) {
7791
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
7792
+ model.layers[il].attn_q_norm,
7793
+ model.layers[il].attn_q_norm_b,
7794
+ LLM_NORM, cb, il);
7795
+ cb(Qcur, "Qcur", il);
6818
7796
 
6819
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7797
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
7798
+ model.layers[il].attn_k_norm,
7799
+ model.layers[il].attn_k_norm_b,
7800
+ LLM_NORM, cb, il);
7801
+ cb(Kcur, "Kcur", il);
7802
+
7803
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7804
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7805
+
7806
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6820
7807
  model.layers[il].wo, model.layers[il].bo,
6821
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7808
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7809
+ } else {
7810
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7811
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7812
+ model.layers[il].wo, model.layers[il].bo,
7813
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7814
+ }
7815
+ }
7816
+
7817
+ if (il == n_layer - 1) {
7818
+ // skip computing output for unused tokens
7819
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7820
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7821
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6822
7822
  }
6823
7823
 
6824
7824
  // Add the input
@@ -6934,6 +7934,13 @@ struct llm_build_context {
6934
7934
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6935
7935
  }
6936
7936
 
7937
+ if (il == n_layer - 1) {
7938
+ // skip computing output for unused tokens
7939
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7940
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7941
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7942
+ }
7943
+
6937
7944
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6938
7945
  cb(ffn_inp, "ffn_inp", il);
6939
7946
 
@@ -7040,6 +8047,13 @@ struct llm_build_context {
7040
8047
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7041
8048
  }
7042
8049
 
8050
+ if (il == n_layer - 1) {
8051
+ // skip computing output for unused tokens
8052
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8053
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8054
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8055
+ }
8056
+
7043
8057
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7044
8058
  cb(ffn_inp, "ffn_inp", il);
7045
8059
 
@@ -7152,6 +8166,13 @@ struct llm_build_context {
7152
8166
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7153
8167
  }
7154
8168
 
8169
+ if (il == n_layer - 1) {
8170
+ // skip computing output for unused tokens
8171
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8172
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8173
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8174
+ }
8175
+
7155
8176
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7156
8177
  cb(ffn_inp, "ffn_inp", il);
7157
8178
 
@@ -7270,6 +8291,14 @@ struct llm_build_context {
7270
8291
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7271
8292
  }
7272
8293
 
8294
+ if (il == n_layer - 1) {
8295
+ // skip computing output for unused tokens
8296
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8297
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8298
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8299
+ attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
8300
+ }
8301
+
7273
8302
  // FF
7274
8303
  {
7275
8304
  ffn_output = llm_build_ffn(ctx0, attn_norm_output,
@@ -7367,6 +8396,14 @@ struct llm_build_context {
7367
8396
 
7368
8397
  cur = attention_norm;
7369
8398
 
8399
+ if (il == n_layer - 1) {
8400
+ // skip computing output for unused tokens
8401
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8402
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8403
+ sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
8404
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8405
+ }
8406
+
7370
8407
  // feed-forward network
7371
8408
  {
7372
8409
  cur = llm_build_ffn(ctx0, cur,
@@ -7459,6 +8496,13 @@ struct llm_build_context {
7459
8496
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7460
8497
  }
7461
8498
 
8499
+ if (il == n_layer - 1) {
8500
+ // skip computing output for unused tokens
8501
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8502
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8503
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8504
+ }
8505
+
7462
8506
  // add the input
7463
8507
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
7464
8508
  cb(ffn_inp, "ffn_inp", il);
@@ -7559,6 +8603,13 @@ struct llm_build_context {
7559
8603
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7560
8604
  }
7561
8605
 
8606
+ if (il == n_layer - 1) {
8607
+ // skip computing output for unused tokens
8608
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8609
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8610
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8611
+ }
8612
+
7562
8613
  // add the input
7563
8614
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
7564
8615
  cb(ffn_inp, "ffn_inp", il);
@@ -7668,6 +8719,13 @@ struct llm_build_context {
7668
8719
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7669
8720
  }
7670
8721
 
8722
+ if (il == n_layer - 1) {
8723
+ // skip computing output for unused tokens
8724
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8725
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8726
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8727
+ }
8728
+
7671
8729
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7672
8730
  cb(ffn_inp, "ffn_inp", il);
7673
8731
 
@@ -7778,6 +8836,13 @@ struct llm_build_context {
7778
8836
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7779
8837
  }
7780
8838
 
8839
+ if (il == n_layer - 1) {
8840
+ // skip computing output for unused tokens
8841
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8842
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8843
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8844
+ }
8845
+
7781
8846
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7782
8847
  cb(ffn_inp, "ffn_inp", il);
7783
8848
 
@@ -7901,6 +8966,13 @@ struct llm_build_context {
7901
8966
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7902
8967
  }
7903
8968
 
8969
+ if (il == n_layer - 1) {
8970
+ // skip computing output for unused tokens
8971
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8972
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8973
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8974
+ }
8975
+
7904
8976
  // scale_res - scale the hidden states for residual connection
7905
8977
  const float scale_res = scale_depth/sqrtf(float(n_layer));
7906
8978
  cur = ggml_scale(ctx0, cur, scale_res);
@@ -8015,6 +9087,13 @@ struct llm_build_context {
8015
9087
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
8016
9088
  }
8017
9089
 
9090
+ if (il == n_layer - 1) {
9091
+ // skip computing output for unused tokens
9092
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9093
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9094
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9095
+ }
9096
+
8018
9097
  struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
8019
9098
  cb(sa_out, "sa_out", il);
8020
9099
 
@@ -8125,7 +9204,13 @@ struct llm_build_context {
8125
9204
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8126
9205
  model.layers[il].wo, model.layers[il].bo,
8127
9206
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8128
- cb(cur, "kqv_out", il);
9207
+ }
9208
+
9209
+ if (il == n_layer - 1) {
9210
+ // skip computing output for unused tokens
9211
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9212
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9213
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8129
9214
  }
8130
9215
 
8131
9216
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -8275,6 +9360,15 @@ struct llm_build_context {
8275
9360
 
8276
9361
  struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0);
8277
9362
 
9363
+ if (il == n_layer - 1) {
9364
+ // skip computing output for unused tokens
9365
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9366
+ x = ggml_get_rows(ctx0, x, inp_out_ids);
9367
+ y = ggml_get_rows(ctx0, y, inp_out_ids);
9368
+ z = ggml_get_rows(ctx0, z, inp_out_ids);
9369
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9370
+ }
9371
+
8278
9372
  // {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens}
8279
9373
  y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
8280
9374
  y = ggml_mul(ctx0, y, ggml_silu(ctx0, z));
@@ -8305,6 +9399,129 @@ struct llm_build_context {
8305
9399
 
8306
9400
  return gf;
8307
9401
  }
9402
+
9403
+ struct ggml_cgraph * build_command_r() {
9404
+
9405
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
9406
+
9407
+ const int64_t n_embd_head = hparams.n_embd_head_v;
9408
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
9409
+ const float f_logit_scale = hparams.f_logit_scale;
9410
+
9411
+ struct ggml_tensor * cur;
9412
+ struct ggml_tensor * inpL;
9413
+
9414
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
9415
+
9416
+ // inp_pos - contains the positions
9417
+ struct ggml_tensor * inp_pos = build_inp_pos();
9418
+
9419
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
9420
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
9421
+
9422
+ for (int il = 0; il < n_layer; ++il) {
9423
+
9424
+ // norm
9425
+ cur = llm_build_norm(ctx0, inpL, hparams,
9426
+ model.layers[il].attn_norm, NULL,
9427
+ LLM_NORM, cb, il);
9428
+ cb(cur, "attn_norm", il);
9429
+ struct ggml_tensor * ffn_inp = cur;
9430
+
9431
+ // self-attention
9432
+ {
9433
+ // compute Q and K and RoPE them
9434
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
9435
+ cb(Qcur, "Qcur", il);
9436
+ if (model.layers[il].bq) {
9437
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
9438
+ cb(Qcur, "Qcur", il);
9439
+ }
9440
+
9441
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
9442
+ cb(Kcur, "Kcur", il);
9443
+ if (model.layers[il].bk) {
9444
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
9445
+ cb(Kcur, "Kcur", il);
9446
+ }
9447
+
9448
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
9449
+ cb(Vcur, "Vcur", il);
9450
+ if (model.layers[il].bv) {
9451
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
9452
+ cb(Vcur, "Vcur", il);
9453
+ }
9454
+
9455
+ Qcur = ggml_rope_custom(
9456
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9457
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9458
+ ext_factor, attn_factor, beta_fast, beta_slow
9459
+ );
9460
+ cb(Qcur, "Qcur", il);
9461
+
9462
+ Kcur = ggml_rope_custom(
9463
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9464
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9465
+ ext_factor, attn_factor, beta_fast, beta_slow
9466
+ );
9467
+ cb(Kcur, "Kcur", il);
9468
+
9469
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9470
+ model.layers[il].wo, model.layers[il].bo,
9471
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9472
+ }
9473
+
9474
+ if (il == n_layer - 1) {
9475
+ // skip computing output for unused tokens
9476
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9477
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9478
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9479
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
9480
+ }
9481
+
9482
+ struct ggml_tensor * attn_out = cur;
9483
+
9484
+ // feed-forward network
9485
+ {
9486
+ cur = llm_build_ffn(ctx0, ffn_inp,
9487
+ model.layers[il].ffn_up, NULL,
9488
+ model.layers[il].ffn_gate, NULL,
9489
+ model.layers[il].ffn_down, NULL,
9490
+ NULL,
9491
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
9492
+ cb(cur, "ffn_out", il);
9493
+ }
9494
+
9495
+ // add together residual + FFN + self-attention
9496
+ cur = ggml_add(ctx0, cur, inpL);
9497
+ cur = ggml_add(ctx0, cur, attn_out);
9498
+ cb(cur, "l_out", il);
9499
+
9500
+ // input for next layer
9501
+ inpL = cur;
9502
+ }
9503
+
9504
+ cur = inpL;
9505
+
9506
+ cur = llm_build_norm(ctx0, cur, hparams,
9507
+ model.output_norm, NULL,
9508
+ LLM_NORM, cb, -1);
9509
+ cb(cur, "result_norm", -1);
9510
+
9511
+ // lm_head
9512
+ cur = ggml_mul_mat(ctx0, model.output, cur);
9513
+
9514
+ if (f_logit_scale) {
9515
+ cur = ggml_scale(ctx0, cur, f_logit_scale);
9516
+ }
9517
+
9518
+ cb(cur, "result_output", -1);
9519
+
9520
+ ggml_build_forward_expand(gf, cur);
9521
+
9522
+ return gf;
9523
+
9524
+ }
8308
9525
  };
8309
9526
 
8310
9527
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -8380,12 +9597,15 @@ static struct ggml_cgraph * llama_build_graph(
8380
9597
  }
8381
9598
 
8382
9599
  // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
8383
- // to fix this, we assign the norm layer manually to the backend of its layer
8384
- if (il != -1 && strcmp(name, "norm") == 0) {
8385
- for (auto * backend : lctx.backends) {
8386
- if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
8387
- ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
8388
- break;
9600
+ // FIXME: fix in ggml_backend_sched
9601
+ const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
9602
+ if (batch.n_tokens < 32 || full_offload) {
9603
+ if (il != -1 && strcmp(name, "norm") == 0) {
9604
+ for (auto * backend : lctx.backends) {
9605
+ if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
9606
+ ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
9607
+ break;
9608
+ }
8389
9609
  }
8390
9610
  }
8391
9611
  }
@@ -8410,6 +9630,10 @@ static struct ggml_cgraph * llama_build_graph(
8410
9630
  {
8411
9631
  result = llm.build_falcon();
8412
9632
  } break;
9633
+ case LLM_ARCH_GROK:
9634
+ {
9635
+ result = llm.build_grok();
9636
+ } break;
8413
9637
  case LLM_ARCH_STARCODER:
8414
9638
  {
8415
9639
  result = llm.build_starcoder();
@@ -8487,6 +9711,14 @@ static struct ggml_cgraph * llama_build_graph(
8487
9711
  {
8488
9712
  result = llm.build_mamba();
8489
9713
  } break;
9714
+ case LLM_ARCH_XVERSE:
9715
+ {
9716
+ result = llm.build_xverse();
9717
+ } break;
9718
+ case LLM_ARCH_COMMAND_R:
9719
+ {
9720
+ result = llm.build_command_r();
9721
+ } break;
8490
9722
  default:
8491
9723
  GGML_ASSERT(false);
8492
9724
  }
@@ -8548,9 +9780,39 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
8548
9780
  ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
8549
9781
  }
8550
9782
 
9783
+ if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
9784
+ GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
9785
+ const int64_t n_tokens = batch.n_tokens;
9786
+
9787
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
9788
+ int32_t * data = (int32_t *) lctx.inp_out_ids->data;
9789
+
9790
+ if (lctx.n_outputs == n_tokens) {
9791
+ for (int i = 0; i < n_tokens; ++i) {
9792
+ data[i] = i;
9793
+ }
9794
+ } else if (batch.logits) {
9795
+ int32_t n_outputs = 0;
9796
+ for (int i = 0; i < n_tokens; ++i) {
9797
+ if (batch.logits[i]) {
9798
+ data[n_outputs++] = i;
9799
+ }
9800
+ }
9801
+ // the graph needs to have been passed the correct number of outputs
9802
+ GGML_ASSERT(lctx.n_outputs == n_outputs);
9803
+ } else if (lctx.n_outputs == 1) {
9804
+ // only keep last output
9805
+ data[0] = n_tokens - 1;
9806
+ } else {
9807
+ GGML_ASSERT(lctx.n_outputs == 0);
9808
+ }
9809
+ }
9810
+
8551
9811
  GGML_ASSERT(
9812
+ // (!a || b) is a logical implication (a -> b)
9813
+ // !hparams.causal_attn -> !cparams.causal_attn
8552
9814
  (hparams.causal_attn || !cparams.causal_attn) &&
8553
- "non-causal attention with generative models is not supported"
9815
+ "causal attention with embedding models is not supported"
8554
9816
  );
8555
9817
 
8556
9818
  if (lctx.inp_KQ_mask) {
@@ -8729,6 +9991,74 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
8729
9991
  }
8730
9992
  }
8731
9993
 
9994
+ // Make sure enough space is available for outputs.
9995
+ // Returns max number of outputs for which space was reserved.
9996
+ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
9997
+ const auto & cparams = lctx.cparams;
9998
+ const auto & hparams = lctx.model.hparams;
9999
+
10000
+ const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
10001
+
10002
+ const auto n_batch = cparams.n_batch;
10003
+ const auto n_vocab = hparams.n_vocab;
10004
+ const auto n_embd = hparams.n_embd;
10005
+
10006
+ // TODO: use a per-batch flag for logits presence instead
10007
+ const bool has_logits = cparams.causal_attn;
10008
+ const bool has_embd = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
10009
+
10010
+ const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
10011
+ const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
10012
+
10013
+ if (lctx.output_ids.empty()) {
10014
+ // init, never resized afterwards
10015
+ lctx.output_ids.resize(n_batch);
10016
+ }
10017
+
10018
+ const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0;
10019
+ const size_t new_size = (logits_size + embd_size) * sizeof(float);
10020
+
10021
+ // alloc only when more than the current capacity is required
10022
+ // TODO: also consider shrinking the buffer
10023
+ if (!lctx.buf_output || prev_size < new_size) {
10024
+ if (lctx.buf_output) {
10025
+ #ifndef NDEBUG
10026
+ // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
10027
+ LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
10028
+ #endif
10029
+ ggml_backend_buffer_free(lctx.buf_output);
10030
+ lctx.buf_output = nullptr;
10031
+ lctx.logits = nullptr;
10032
+ lctx.embd = nullptr;
10033
+ }
10034
+
10035
+ lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
10036
+ if (lctx.buf_output == nullptr) {
10037
+ LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
10038
+ return 0;
10039
+ }
10040
+ }
10041
+
10042
+ float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output);
10043
+
10044
+ lctx.logits = has_logits ? output_base : nullptr;
10045
+ lctx.embd = has_embd ? output_base + logits_size : nullptr;
10046
+
10047
+ lctx.output_size = n_outputs_max;
10048
+ lctx.logits_size = logits_size;
10049
+ lctx.embd_size = embd_size;
10050
+
10051
+ // set all ids as invalid (negative)
10052
+ std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);
10053
+
10054
+ ggml_backend_buffer_clear(lctx.buf_output, 0);
10055
+
10056
+ lctx.n_outputs = 0;
10057
+
10058
+ return n_outputs_max;
10059
+ }
10060
+
10061
+
8732
10062
  static void llama_graph_compute(
8733
10063
  llama_context & lctx,
8734
10064
  ggml_cgraph * gf,
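
llama_output_reserve above packs logits and embeddings back to back in a single host buffer, so its size is simply (n_vocab + n_embd) * n_outputs_max floats when both are kept, with embeddings starting right after the logits. A rough sketch of that sizing, using illustrative model dimensions rather than values from any particular model:

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t n_outputs_max = 8;      // max(n_outputs, n_seq_max)
        const size_t n_vocab       = 32000;  // illustrative
        const size_t n_embd        = 4096;   // illustrative

        const size_t logits_size = n_vocab * n_outputs_max;                 // in floats
        const size_t embd_size   = n_embd  * n_outputs_max;                 // in floats
        const size_t buf_bytes   = (logits_size + embd_size) * sizeof(float);

        // logits start at the base of the buffer, embeddings follow immediately after
        printf("buffer = %.2f MiB, embd offset = %zu floats\n",
               buf_bytes / (1024.0 * 1024.0), logits_size);
        return 0;
    }
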
@@ -8804,16 +10134,8 @@ static int llama_decode_internal(
8804
10134
  const int64_t n_embd = hparams.n_embd;
8805
10135
  const int64_t n_vocab = hparams.n_vocab;
8806
10136
 
8807
-
8808
- auto * logits_out = lctx.logits;
8809
-
8810
- #ifndef NDEBUG
8811
- auto & logits_valid = lctx.logits_valid;
8812
- logits_valid.clear();
8813
- logits_valid.resize(n_tokens_all);
8814
-
8815
- memset(logits_out, 0, lctx.logits_size*sizeof(float));
8816
- #endif
10137
+ uint32_t n_outputs = 0;
10138
+ uint32_t n_outputs_prev = 0;
8817
10139
 
8818
10140
  const auto n_ubatch = cparams.n_ubatch;
8819
10141
 
@@ -8822,6 +10144,38 @@ static int llama_decode_internal(
8822
10144
  std::vector<llama_seq_id *> seq_id_arr;
8823
10145
  std::vector<std::vector<llama_seq_id>> seq_id;
8824
10146
 
10147
+ // count outputs
10148
+ if (batch_all.logits) {
10149
+ for (uint32_t i = 0; i < n_tokens_all; ++i) {
10150
+ n_outputs += batch_all.logits[i] != 0;
10151
+ }
10152
+ } else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
10153
+ n_outputs = n_tokens_all;
10154
+ } else {
10155
+ // keep last output only
10156
+ n_outputs = 1;
10157
+ }
10158
+
10159
+ // reserve output buffer
10160
+ if (llama_output_reserve(lctx, n_outputs) < n_outputs) {
10161
+ LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs);
10162
+ return -2;
10163
+ };
10164
+
10165
+ // set output mappings
10166
+ if (batch_all.logits) {
10167
+ int32_t i_logits = 0;
10168
+ for (uint32_t i = 0; i < n_tokens_all; ++i) {
10169
+ if (batch_all.logits[i]) {
10170
+ lctx.output_ids[i] = i_logits++;
10171
+ }
10172
+ }
10173
+ } else {
10174
+ for (uint32_t i = 0; i < n_outputs; ++i) {
10175
+ lctx.output_ids[i] = i;
10176
+ }
10177
+ }
10178
+
8825
10179
  for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
8826
10180
  const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
8827
10181
  llama_batch u_batch = {
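
Seen from the caller's side, the output counting and output_ids mapping above mean a batch only pays for the rows it flags: batch.logits[i] marks token i as producing an output, and llama_get_logits_ith() later resolves that batch position through output_ids. A hedged sketch of that usage, assuming the llama.h batch API of this release and with error handling trimmed:

    #include "llama.h"
    #include <vector>

    // sketch: decode a prompt and extract logits only for its last token
    static const float * decode_last_logits(llama_context * ctx, const std::vector<llama_token> & prompt) {
        const int n = (int) prompt.size();
        llama_batch batch = llama_batch_init(n, /*embd*/ 0, /*n_seq_max*/ 1);
        for (int i = 0; i < n; ++i) {
            batch.token[i]     = prompt[i];
            batch.pos[i]       = i;
            batch.n_seq_id[i]  = 1;
            batch.seq_id[i][0] = 0;
            batch.logits[i]    = (i == n - 1);  // only this row reaches the output buffer
        }
        batch.n_tokens = n;

        const float * logits = nullptr;
        if (llama_decode(ctx, batch) == 0) {
            logits = llama_get_logits_ith(ctx, n - 1);  // index is the batch position
        }
        llama_batch_free(batch);
        return logits;
    }
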
@@ -8837,6 +10191,27 @@ static int llama_decode_internal(
8837
10191
  /* .all_seq_id = */ batch_all.all_seq_id,
8838
10192
  };
8839
10193
 
10194
+ // count the outputs in this u_batch
10195
+ {
10196
+ int32_t n_outputs_new = 0;
10197
+
10198
+ if (u_batch.logits) {
10199
+ for (uint32_t i = 0; i < n_tokens; i++) {
10200
+ n_outputs_new += u_batch.logits[i] != 0;
10201
+ }
10202
+ } else if (n_outputs == n_tokens_all) {
10203
+ n_outputs_new = n_tokens;
10204
+ } else {
10205
+ // keep last output only
10206
+ if (cur_token + n_tokens >= n_tokens_all) {
10207
+ n_outputs_new = 1;
10208
+ }
10209
+ }
10210
+
10211
+ // needs to happen before the graph is built
10212
+ lctx.n_outputs = n_outputs_new;
10213
+ }
10214
+
8840
10215
  int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
8841
10216
  GGML_ASSERT(n_threads > 0);
8842
10217
 
@@ -8900,23 +10275,37 @@ static int llama_decode_internal(
8900
10275
  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
8901
10276
  struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
8902
10277
 
8903
- if (!hparams.causal_attn) {
10278
+ if (lctx.n_outputs == 0) {
10279
+ // no output
10280
+ res = nullptr;
10281
+ embd = nullptr;
10282
+ } else if (!hparams.causal_attn) {
8904
10283
  res = nullptr; // do not extract logits for embedding models such as BERT
8905
10284
 
8906
10285
  // token or sequence embeddings
8907
10286
  embd = gf->nodes[gf->n_nodes - 1];
8908
10287
 
8909
10288
  GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
8910
- } else {
8911
- if (strcmp(res->name, "result_output") == 0) {
8912
- // the token embeddings could be the second to last tensor, or the third to last tensor
8913
- if (strcmp(embd->name, "result_norm") != 0) {
8914
- embd = gf->nodes[gf->n_nodes - 3];
8915
- GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
8916
- }
8917
- } else {
8918
- GGML_ASSERT(false && "missing result_output tensor");
10289
+ } else if (cparams.embeddings) {
10290
+ // the embeddings could be in the second to last tensor, or any of the previous tensors
10291
+ int i_embd = gf->n_nodes - 2;
10292
+ for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
10293
+ i_embd = gf->n_nodes - i;
10294
+ if (i_embd < 0) { break; }
10295
+ embd = gf->nodes[i_embd];
10296
+ }
10297
+ GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
10298
+
10299
+ // TODO: use a per-batch flag to know when to skip logits while keeping embeddings
10300
+ if (!cparams.causal_attn) {
10301
+ res = nullptr; // do not extract logits when not needed
10302
+ // skip computing logits
10303
+ // TODO: is this safe?
10304
+ gf->n_nodes = i_embd + 1;
8919
10305
  }
10306
+ } else {
10307
+ embd = nullptr; // do not extract embeddings when not needed
10308
+ GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
8920
10309
  }
8921
10310
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
8922
10311
 
@@ -8959,67 +10348,38 @@ static int llama_decode_internal(
8959
10348
  //}
8960
10349
 
8961
10350
  // extract logits
8962
- // TODO: do not compute and extract logits if only embeddings are needed
8963
- // update the graphs to skip "result_output" if logits are not needed
8964
10351
  if (res) {
8965
10352
  ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
8966
10353
  GGML_ASSERT(backend_res != nullptr);
8967
- if (u_batch.logits) {
8968
- int32_t i_first = -1;
8969
- for (uint32_t i = 0; i < n_tokens; i++) {
8970
- if (u_batch.logits[i] && i_first == -1) {
8971
- i_first = (int32_t) i;
8972
- }
8973
- if (u_batch.logits[i] == 0 || i == n_tokens - 1) {
8974
- if (i_first != -1) {
8975
- int i_last = u_batch.logits[i] == 0 ? i : i + 1;
8976
- // extract logits for the range [i_first, i_last)
8977
- // group the requests to minimize the number of calls to the backend
8978
- ggml_backend_tensor_get_async(backend_res, res,
8979
- logits_out + n_vocab*(cur_token + i_first),
8980
- i_first*n_vocab*sizeof(float),
8981
- (i_last - i_first)*n_vocab*sizeof(float));
8982
- i_first = -1;
8983
- }
8984
- }
8985
- #ifndef NDEBUG
8986
- logits_valid[cur_token + i] = u_batch.logits[i] != 0;;
8987
- #endif
8988
- }
8989
- } else if (lctx.logits_all) {
8990
- ggml_backend_tensor_get_async(backend_res, res, logits_out + n_vocab*cur_token, 0, n_vocab*n_tokens*sizeof(float));
8991
- #ifndef NDEBUG
8992
- std::fill(logits_valid.begin() + cur_token, logits_valid.begin() + cur_token + n_tokens, true);
8993
- #endif
8994
- } else {
8995
- if (cur_token + n_tokens >= n_tokens_all) {
8996
- ggml_backend_tensor_get_async(backend_res, res, logits_out, n_vocab*(n_tokens - 1)*sizeof(float), n_vocab*sizeof(float));
8997
- #ifndef NDEBUG
8998
- logits_valid[0] = true;
8999
- #endif
9000
- }
10354
+ GGML_ASSERT(lctx.logits != nullptr);
10355
+
10356
+ float * logits_out = lctx.logits + n_outputs_prev*n_vocab;
10357
+ const int32_t n_outputs_new = lctx.n_outputs;
10358
+
10359
+ if (n_outputs_new) {
10360
+ GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
10361
+ GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size);
10362
+ ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float));
9001
10363
  }
9002
10364
  }
9003
10365
 
9004
10366
  // extract embeddings
9005
- if (cparams.embeddings && embd) {
10367
+ if (embd) {
9006
10368
  ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
9007
10369
  GGML_ASSERT(backend_embd != nullptr);
9008
10370
 
9009
10371
  switch (cparams.pooling_type) {
9010
10372
  case LLAMA_POOLING_TYPE_NONE:
9011
- {
9012
- // extract token embeddings
9013
- auto & embd_out = lctx.embd;
9014
-
9015
- if (u_batch.logits) {
9016
- //embd_out.resize(n_embd * n_tokens);
9017
- for (uint32_t i = 0; i < n_tokens; i++) {
9018
- if (u_batch.logits[i] == 0) {
9019
- continue;
9020
- }
9021
- ggml_backend_tensor_get_async(backend_embd, embd, embd_out + n_embd*(i + cur_token), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
9022
- }
10373
+ {
10374
+ // extract token embeddings
10375
+ GGML_ASSERT(lctx.embd != nullptr);
10376
+ float * embd_out = lctx.embd + n_outputs_prev*n_embd;
10377
+ const int32_t n_outputs_new = lctx.n_outputs;
10378
+
10379
+ if (n_outputs_new) {
10380
+ GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
10381
+ GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size);
10382
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
9023
10383
  }
9024
10384
  } break;
9025
10385
  case LLAMA_POOLING_TYPE_CLS:
@@ -9046,6 +10406,7 @@ static int llama_decode_internal(
9046
10406
  } break;
9047
10407
  }
9048
10408
  }
10409
+ n_outputs_prev += lctx.n_outputs;
9049
10410
  }
9050
10411
 
9051
10412
  // wait for the computation to finish (automatically done when obtaining the model output)
@@ -9976,7 +11337,7 @@ struct llm_tokenizer_wpm {
9976
11337
  if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
9977
11338
  continue;
9978
11339
  }
9979
- code = to_lower(code);
11340
+ code = unicode_tolower(code);
9980
11341
  if (type == CODEPOINT_TYPE_WHITESPACE) {
9981
11342
  code = ' ';
9982
11343
  }
@@ -9996,7 +11357,7 @@ struct llm_tokenizer_wpm {
9996
11357
  std::vector<std::string> words;
9997
11358
  while (r < new_str.size()) {
9998
11359
  // if is whitespace
9999
- if (isspace(new_str[r])) {
11360
+ if (isspace(new_str[r], std::locale::classic())) {
10000
11361
  if (r > l) words.push_back(new_str.substr(l, (r - l)));
10001
11362
  l = r + 1;
10002
11363
  r = l;
@@ -10010,18 +11371,12 @@ struct llm_tokenizer_wpm {
10010
11371
  return words;
10011
11372
  }
10012
11373
 
10013
- uint32_t to_lower(uint32_t code) {
10014
- static const std::locale locale("en_US.UTF-8");
10015
- #if defined(_WIN32)
10016
- if (code > 0xFFFF) {
10017
- return code;
10018
- }
10019
- #endif
10020
- return std::tolower(wchar_t(code), locale);
10021
- }
10022
-
10023
11374
  bool is_ascii_punct(uint32_t code) {
10024
- return code < 256 && ispunct(code);
11375
+ if (code > 0xFF) {
11376
+ return false;
11377
+ }
11378
+ auto c = char(static_cast<unsigned char>(code));
11379
+ return ispunct(c, std::locale::classic());
10025
11380
  }
10026
11381
 
10027
11382
  bool is_chinese_char(uint32_t cpt) {
@@ -10266,28 +11621,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
10266
11621
  // grammar - internal
10267
11622
  //
10268
11623
 
10269
- struct llama_partial_utf8 {
10270
- uint32_t value; // bit value so far (unshifted)
10271
- int n_remain; // num bytes remaining; -1 indicates invalid sequence
10272
- };
10273
-
10274
- struct llama_grammar {
10275
- const std::vector<std::vector<llama_grammar_element>> rules;
10276
- std::vector<std::vector<const llama_grammar_element *>> stacks;
10277
-
10278
- // buffer for partially generated UTF-8 sequence from accepted tokens
10279
- llama_partial_utf8 partial_utf8;
10280
- };
10281
-
10282
- struct llama_grammar_candidate {
10283
- size_t index;
10284
- const uint32_t * code_points;
10285
- llama_partial_utf8 partial_utf8;
10286
- };
10287
11624
 
10288
11625
  // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
10289
11626
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
10290
- static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
11627
+ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
10291
11628
  const std::string & src,
10292
11629
  llama_partial_utf8 partial_start) {
10293
11630
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@@ -10489,7 +11826,7 @@ static void llama_grammar_advance_stack(
10489
11826
  // be positioned at a character range (see `llama_grammar_advance_stack`), and
10490
11827
  // produces the N possible stacks if the given char is accepted at those
10491
11828
  // positions
10492
- static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
11829
+ std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
10493
11830
  const std::vector<std::vector<llama_grammar_element>> & rules,
10494
11831
  const std::vector<std::vector<const llama_grammar_element *>> & stacks,
10495
11832
  const uint32_t chr) {
@@ -11715,7 +13052,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
11715
13052
  // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
11716
13053
  // for getting the current layer as I initially thought, and we need to resort to parsing the
11717
13054
  // tensor name.
11718
- n_layer /= n_expert;
11719
13055
  if (sscanf(name, "blk.%d.", &i_layer) != 1) {
11720
13056
  throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
11721
13057
  }
@@ -11729,30 +13065,39 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
11729
13065
  // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
11730
13066
  // with the quantization of the output tensor
11731
13067
  if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
11732
- int nx = tensor->ne[0];
11733
- if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
11734
- new_type = GGML_TYPE_Q8_0;
11735
- }
11736
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
11737
- ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
11738
- new_type = GGML_TYPE_Q5_K;
11739
- }
11740
- else if (new_type != GGML_TYPE_Q8_0) {
11741
- new_type = GGML_TYPE_Q6_K;
13068
+ if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
13069
+ new_type = qs.params->output_tensor_type;
13070
+ } else {
13071
+ int nx = tensor->ne[0];
13072
+ if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
13073
+ new_type = GGML_TYPE_Q8_0;
13074
+ }
13075
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
13076
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
13077
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
13078
+ new_type = GGML_TYPE_Q5_K;
13079
+ }
13080
+ else if (new_type != GGML_TYPE_Q8_0) {
13081
+ new_type = GGML_TYPE_Q6_K;
13082
+ }
11742
13083
  }
11743
13084
  } else if (name == "token_embd.weight") {
11744
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
11745
- ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
11746
- new_type = GGML_TYPE_Q2_K;
11747
- }
11748
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
11749
- new_type = GGML_TYPE_IQ3_S;
11750
- }
11751
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
11752
- new_type = GGML_TYPE_IQ3_S;
13085
+ if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
13086
+ new_type = qs.params->token_embedding_type;
13087
+ } else {
13088
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
13089
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
13090
+ new_type = GGML_TYPE_Q2_K;
13091
+ }
13092
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
13093
+ new_type = GGML_TYPE_IQ3_S;
13094
+ }
13095
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
13096
+ new_type = GGML_TYPE_IQ3_S;
13097
+ }
11753
13098
  }
11754
13099
  } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
11755
- ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
13100
+ ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
11756
13101
  if (name.find("attn_v.weight") != std::string::npos) {
11757
13102
  if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
11758
13103
  else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
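
The two new branches above are driven by llama_model_quantize_params: leaving output_tensor_type or token_embedding_type at GGML_TYPE_COUNT keeps the built-in heuristics, while any other value wins outright. A small sketch of setting them through the public API; the file names are placeholders.

    #include "llama.h"

    int main() {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M;
        params.output_tensor_type   = GGML_TYPE_Q8_0;  // force output.weight to Q8_0
        params.token_embedding_type = GGML_TYPE_Q4_K;  // force token_embd.weight to Q4_K

        // placeholder file names
        return (int) llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params);
    }
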
@@ -11771,7 +13116,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
11771
13116
  if (qs.model.hparams.n_expert == 8) {
11772
13117
  new_type = GGML_TYPE_Q5_K;
11773
13118
  } else {
11774
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
13119
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
11775
13120
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
11776
13121
  }
11777
13122
  }
@@ -11785,13 +13130,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
11785
13130
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
11786
13131
  new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
11787
13132
  }
11788
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
11789
- new_type = GGML_TYPE_Q4_K;
11790
- }
11791
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
11792
- new_type = GGML_TYPE_Q4_K;
11793
- }
11794
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
13133
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
11795
13134
  new_type = GGML_TYPE_Q4_K;
11796
13135
  }
11797
13136
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
@@ -11944,7 +13283,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
11944
13283
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
11945
13284
  new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
11946
13285
  new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
11947
- new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
13286
+ new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
13287
+ new_type == GGML_TYPE_IQ1_M) {
11948
13288
  int nx = tensor->ne[0];
11949
13289
  int ny = tensor->ne[1];
11950
13290
  if (nx % QK_K != 0) {
@@ -11962,6 +13302,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
11962
13302
  case GGML_TYPE_IQ3_XXS:
11963
13303
  case GGML_TYPE_IQ3_S:
11964
13304
  case GGML_TYPE_IQ1_S:
13305
+ case GGML_TYPE_IQ1_M:
11965
13306
  case GGML_TYPE_Q2_K:
11966
13307
  case GGML_TYPE_Q3_K:
11967
13308
  case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
@@ -12043,6 +13384,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12043
13384
  case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
12044
13385
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
12045
13386
  case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
13387
+ case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
12046
13388
  case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
12047
13389
  case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
12048
13390
  case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
@@ -12065,8 +13407,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12065
13407
  constexpr bool use_mmap = false;
12066
13408
  #endif
12067
13409
 
12068
- llama_model_loader ml(fname_inp, use_mmap, NULL);
12069
- ml.init_mapping(false); // no prefetching?
13410
+ llama_model_kv_override * kv_overrides = nullptr;
13411
+ if (params->kv_overrides) {
13412
+ auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
13413
+ kv_overrides = v->data();
13414
+ }
13415
+ llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
13416
+ ml.init_mappings(false); // no prefetching
12070
13417
 
12071
13418
  llama_model model;
12072
13419
  llm_load_arch(ml, model);
@@ -12090,36 +13437,43 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12090
13437
  struct gguf_context * ctx_out = gguf_init_empty();
12091
13438
 
12092
13439
  // copy the KV pairs from the input file
12093
- gguf_set_kv (ctx_out, ml.ctx_gguf);
13440
+ gguf_set_kv (ctx_out, ml.meta);
12094
13441
  gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
12095
13442
  gguf_set_val_u32(ctx_out, "general.file_type", ftype);
12096
13443
 
13444
+ if (params->kv_overrides) {
13445
+ const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
13446
+ for (auto & o : overrides) {
13447
+ if (o.key[0] == 0) break;
13448
+ if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
13449
+ gguf_set_val_f32(ctx_out, o.key, o.float_value);
13450
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
13451
+ gguf_set_val_i32(ctx_out, o.key, o.int_value);
13452
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
13453
+ gguf_set_val_bool(ctx_out, o.key, o.bool_value);
13454
+ } else {
13455
+ LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
13456
+ }
13457
+ }
13458
+ }
13459
+
12097
13460
  for (int i = 0; i < ml.n_tensors; ++i) {
12098
- struct ggml_tensor * meta = ml.get_tensor_meta(i);
13461
+ const struct ggml_tensor * meta = ml.get_tensor_meta(i);
12099
13462
 
12100
13463
  const std::string name = ggml_get_name(meta);
12101
13464
 
12102
13465
  // TODO: avoid hardcoded tensor names - use the TN_* constants
12103
13466
  if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
12104
13467
  ++qs.n_attention_wv;
12105
- }
12106
- else if (name.find("ffn_down") != std::string::npos) {
12107
- ++qs.n_ffn_down;
12108
- }
12109
- else if (name.find("ffn_gate") != std::string::npos) {
12110
- ++qs.n_ffn_gate;
12111
- }
12112
- else if (name.find("ffn_up") != std::string::npos) {
12113
- ++qs.n_ffn_up;
12114
- }
12115
- else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
13468
+ } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
12116
13469
  qs.has_output = true;
12117
13470
  }
12118
13471
  }
12119
- if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
12120
- LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
12121
- __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
12122
- }
13472
+
13473
+ qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
13474
+
13475
+ // sanity checks
13476
+ GGML_ASSERT(qs.n_attention_wv == (int)model.hparams.n_layer && "n_attention_wv != n_layer is unexpected");
12123
13477
 
12124
13478
  size_t total_size_org = 0;
12125
13479
  size_t total_size_new = 0;
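
The kv_overrides handling above expects params->kv_overrides to point at a std::vector<llama_model_kv_override> whose last entry has an empty key, which both the loader and the copy loop treat as a sentinel. A sketch of wiring that up; the key and value below are purely illustrative.

    #include "llama.h"
    #include <cstring>
    #include <vector>

    int main() {
        std::vector<llama_model_kv_override> overrides;

        llama_model_kv_override o;
        std::memset(&o, 0, sizeof(o));
        std::strncpy(o.key, "some.namespace.int_key", sizeof(o.key) - 1);  // illustrative key
        o.tag       = LLAMA_KV_OVERRIDE_TYPE_INT;
        o.int_value = 2;
        overrides.push_back(o);

        // empty-key sentinel marks the end of the override list
        overrides.emplace_back();
        overrides.back().key[0] = 0;

        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.kv_overrides = &overrides;  // consumed as a std::vector<llama_model_kv_override> *

        return (int) llama_model_quantize("model-f16.gguf", "model-q5_1.gguf", &params);
    }
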
@@ -12135,7 +13489,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12135
13489
 
12136
13490
  // populate the original tensors so we get an initial meta data
12137
13491
  for (int i = 0; i < ml.n_tensors; ++i) {
12138
- struct ggml_tensor * meta = ml.get_tensor_meta(i);
13492
+ const struct ggml_tensor * meta = ml.get_tensor_meta(i);
12139
13493
  gguf_add_tensor(ctx_out, meta);
12140
13494
  }
12141
13495
 
@@ -12149,6 +13503,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12149
13503
  // placeholder for the meta data
12150
13504
  ::zeros(fout, meta_size);
12151
13505
 
13506
+ const auto tn = LLM_TN(model.arch);
13507
+
12152
13508
  for (int i = 0; i < ml.n_tensors; ++i) {
12153
13509
  struct ggml_tensor * tensor = ml.get_tensor_meta(i);
12154
13510
 
@@ -12171,8 +13527,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12171
13527
  // This used to be a regex, but <regex> has an extreme cost to compile times.
12172
13528
  bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
12173
13529
 
12174
- // quantize only 2D tensors
12175
- quantize &= (ggml_n_dims(tensor) == 2);
13530
+ // quantize only 2D and 3D tensors (experts)
13531
+ quantize &= (ggml_n_dims(tensor) >= 2);
12176
13532
  quantize &= params->quantize_output_tensor || name != "output.weight";
12177
13533
  quantize &= !params->only_copy;
12178
13534
 
@@ -12201,6 +13557,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12201
13557
  if (!params->pure && ggml_is_quantized(default_type)) {
12202
13558
  new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
12203
13559
  }
13560
+ else if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
13561
+ new_type = params->token_embedding_type;
13562
+ }
13563
+ else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
13564
+ new_type = params->output_tensor_type;
13565
+ }
12204
13566
 
12205
13567
  // If we've decided to quantize to the same type the tensor is already
12206
13568
  // in then there's nothing to do.
@@ -12221,11 +13583,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12221
13583
  if (it == imatrix_data->end()) {
12222
13584
  LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
12223
13585
  } else {
12224
- if (it->second.size() == (size_t)tensor->ne[0]) {
13586
+ if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
12225
13587
  imatrix = it->second.data();
12226
13588
  } else {
12227
13589
  LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
12228
- int(it->second.size()), int(tensor->ne[0]), tensor->name);
13590
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
13591
+
13592
+ // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
13593
+ // this is a significant error and it may be a good idea to abort the process if this happens,
13594
+ // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
13595
+ // tok_embd should be ignored in this case, since it always causes this warning
13596
+ if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
13597
+ throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
13598
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
13599
+ }
12229
13600
  }
12230
13601
  }
12231
13602
  }
@@ -12233,6 +13604,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12233
13604
  new_type == GGML_TYPE_IQ2_XS ||
12234
13605
  new_type == GGML_TYPE_IQ2_S ||
12235
13606
  new_type == GGML_TYPE_IQ1_S ||
13607
+ (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
12236
13608
  (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
12237
13609
  LLAMA_LOG_ERROR("\n\n============================================================\n");
12238
13610
  LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
@@ -12261,15 +13633,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12261
13633
  new_data = work.data();
12262
13634
 
12263
13635
  const int n_per_row = tensor->ne[0];
12264
- const int nrows = nelements / n_per_row;
13636
+ const int nrows = tensor->ne[1];
12265
13637
 
12266
13638
  static const int min_chunk_size = 32 * 512;
12267
13639
  const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
12268
13640
 
12269
- const int nchunk = (nelements + chunk_size - 1)/chunk_size;
13641
+ const int nelements_matrix = tensor->ne[0] * tensor->ne[1];
13642
+ const int nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
12270
13643
  const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
12271
- new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix, workers, nthread_use);
12272
13644
 
13645
+ // quantize each expert separately since they have different importance matrices
13646
+ new_size = 0;
13647
+ for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
13648
+ const float * f32_data_03 = f32_data + i03 * nelements_matrix;
13649
+ void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
13650
+ const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
13651
+
13652
+ new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
13653
+ }
12273
13654
  LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
12274
13655
  }
12275
13656
  total_size_org += ggml_nbytes(tensor);
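
With 3-D expert tensors, the loop above quantizes one ne[2] slice at a time: the f32 source advances by ne[0]*ne[1] floats per expert, while the destination advances by the quantized row size times the row count. A small sketch of just that offset arithmetic, with illustrative shapes:

    #include "ggml.h"
    #include <cstdio>

    int main() {
        const ggml_type qtype     = GGML_TYPE_Q4_K;
        const int64_t   n_per_row = 4096;   // ne[0], illustrative
        const int64_t   nrows     = 14336;  // ne[1], illustrative
        const int64_t   n_expert  = 8;      // ne[2]

        for (int64_t e = 0; e < n_expert; ++e) {
            const size_t src_off = (size_t) e * n_per_row * nrows * sizeof(float);        // f32 slice
            const size_t dst_off = (size_t) e * nrows * ggml_row_size(qtype, n_per_row);  // quantized slice
            printf("expert %2d: src +%zu bytes, dst +%zu bytes\n", (int) e, src_off, dst_off);
        }
        return 0;
    }
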
@@ -12340,7 +13721,7 @@ static int llama_apply_lora_from_file_internal(
12340
13721
  if (path_base_model) {
12341
13722
  LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
12342
13723
  ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
12343
- ml->init_mapping(/*prefetch*/ false); // no prefetching
13724
+ ml->init_mappings(/*prefetch*/ false); // no prefetching
12344
13725
  }
12345
13726
 
12346
13727
  struct tensor_meta {
@@ -12461,7 +13842,7 @@ static int llama_apply_lora_from_file_internal(
12461
13842
 
12462
13843
  ggml_tensor * base_t;
12463
13844
  if (ml) {
12464
- if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) {
13845
+ if (!ml->get_tensor_meta(base_name.c_str())) {
12465
13846
  LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
12466
13847
  return 1;
12467
13848
  }
@@ -12645,11 +14026,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
12645
14026
  struct llama_model_quantize_params result = {
12646
14027
  /*.nthread =*/ 0,
12647
14028
  /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
14029
+ /*.output_tensor_type =*/ GGML_TYPE_COUNT,
14030
+ /*.token_embedding_type =*/ GGML_TYPE_COUNT,
12648
14031
  /*.allow_requantize =*/ false,
12649
14032
  /*.quantize_output_tensor =*/ true,
12650
14033
  /*.only_copy =*/ false,
12651
14034
  /*.pure =*/ false,
12652
14035
  /*.imatrix =*/ nullptr,
14036
+ /*.kv_overrides =*/ nullptr,
12653
14037
  };
12654
14038
 
12655
14039
  return result;
@@ -12658,7 +14042,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
12658
14042
  size_t llama_max_devices(void) {
12659
14043
  #if defined(GGML_USE_METAL)
12660
14044
  return 1;
12661
- #elif defined(GGML_USE_CUBLAS)
14045
+ #elif defined(GGML_USE_CUDA)
12662
14046
  return GGML_CUDA_MAX_DEVICES;
12663
14047
  #elif defined(GGML_USE_SYCL)
12664
14048
  return GGML_SYCL_MAX_DEVICES;
@@ -12678,8 +14062,8 @@ bool llama_supports_mlock(void) {
12678
14062
  }
12679
14063
 
12680
14064
  bool llama_supports_gpu_offload(void) {
12681
- #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
12682
- defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
14065
+ #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
14066
+ defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
12683
14067
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
12684
14068
  return true;
12685
14069
  #else
@@ -12786,7 +14170,7 @@ struct llama_context * llama_new_context_with_model(
12786
14170
  const auto & hparams = model->hparams;
12787
14171
  auto & cparams = ctx->cparams;
12788
14172
 
12789
- // TODO: maybe add n_seq_max here too
14173
+ cparams.n_seq_max = std::max(1u, params.n_seq_max);
12790
14174
  cparams.n_threads = params.n_threads;
12791
14175
  cparams.n_threads_batch = params.n_threads_batch;
12792
14176
  cparams.yarn_ext_factor = params.yarn_ext_factor;
@@ -12802,6 +14186,9 @@ struct llama_context * llama_new_context_with_model(
12802
14186
  cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
12803
14187
  cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
12804
14188
 
14189
+ // this is necessary due to kv_self.n being padded later during inference
14190
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, 32);
14191
+
12805
14192
  // with causal attention, the batch size is limited by the context size
12806
14193
  cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
12807
14194
  cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
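
The padding added above rounds the context size up front because kv_self.n is later padded in steps of 32; GGML_PAD(x, 32) rounds x up to the next multiple of 32. Worked by hand, not library output:

    GGML_PAD(4096, 32) = 4096      // already a multiple of 32
    GGML_PAD(4097, 32) = 4128      // ((4097 + 31) / 32) * 32

So a requested n_ctx that is not a multiple of 32 ends up slightly larger than asked for.
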
@@ -12881,32 +14268,43 @@ struct llama_context * llama_new_context_with_model(
12881
14268
  }
12882
14269
  ctx->backends.push_back(ctx->backend_metal);
12883
14270
  }
12884
- #elif defined(GGML_USE_CUBLAS)
12885
- if (model->n_gpu_layers > 0) {
14271
+ #elif defined(GGML_USE_CUDA)
14272
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
12886
14273
  // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
12887
- if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
12888
- ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
14274
+ ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
14275
+ if (backend == nullptr) {
14276
+ LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
14277
+ llama_free(ctx);
14278
+ return nullptr;
14279
+ }
14280
+ ctx->backends.push_back(backend);
14281
+ } else {
14282
+ // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
14283
+ for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
14284
+ ggml_backend_t backend = ggml_backend_cuda_init(device);
12889
14285
  if (backend == nullptr) {
12890
- LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
14286
+ LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
12891
14287
  llama_free(ctx);
12892
14288
  return nullptr;
12893
14289
  }
12894
14290
  ctx->backends.push_back(backend);
12895
- } else {
12896
- // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
12897
- for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
12898
- ggml_backend_t backend = ggml_backend_cuda_init(device);
12899
- if (backend == nullptr) {
12900
- LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
12901
- llama_free(ctx);
12902
- return nullptr;
12903
- }
12904
- ctx->backends.push_back(backend);
12905
- }
12906
14291
  }
12907
14292
  }
12908
14293
  #elif defined(GGML_USE_VULKAN)
12909
- if (model->n_gpu_layers > 0) {
14294
+ if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
14295
+ LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
14296
+ llama_free(ctx);
14297
+ return nullptr;
14298
+ }
14299
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
14300
+ ggml_backend_t backend = ggml_backend_vk_init(0);
14301
+ if (backend == nullptr) {
14302
+ LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
14303
+ llama_free(ctx);
14304
+ return nullptr;
14305
+ }
14306
+ ctx->backends.push_back(backend);
14307
+ } else {
12910
14308
  for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
12911
14309
  ggml_backend_t backend = ggml_backend_vk_init(device);
12912
14310
  if (backend == nullptr) {
@@ -12918,31 +14316,28 @@ struct llama_context * llama_new_context_with_model(
12918
14316
  }
12919
14317
  }
12920
14318
  #elif defined(GGML_USE_SYCL)
12921
- if (model->n_gpu_layers > 0) {
12922
- // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
12923
- if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
12924
- int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
12925
- ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
14319
+ // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
14320
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
14321
+ ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
14322
+ if (backend == nullptr) {
14323
+ int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
14324
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
14325
+ llama_free(ctx);
14326
+ return nullptr;
14327
+ }
14328
+ ctx->backends.push_back(backend);
14329
+ } else {
14330
+ // LLAMA_SPLIT_LAYER requires a backend for each GPU
14331
+ for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
14332
+ ggml_backend_t backend = ggml_backend_sycl_init(i);
12926
14333
  if (backend == nullptr) {
12927
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
14334
+ int id_list[GGML_SYCL_MAX_DEVICES];
14335
+ ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
14336
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
12928
14337
  llama_free(ctx);
12929
14338
  return nullptr;
12930
14339
  }
12931
14340
  ctx->backends.push_back(backend);
12932
- } else {
12933
- // LLAMA_SPLIT_LAYER requires a backend for each GPU
12934
- int id_list[GGML_SYCL_MAX_DEVICES];
12935
- ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
12936
- for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
12937
- int device_id = id_list[i];
12938
- ggml_backend_t backend = ggml_backend_sycl_init(i);
12939
- if (backend == nullptr) {
12940
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
12941
- llama_free(ctx);
12942
- return nullptr;
12943
- }
12944
- ctx->backends.push_back(backend);
12945
- }
12946
14341
  }
12947
14342
  }
12948
14343
  #elif defined(GGML_USE_KOMPUTE)
@@ -12990,25 +14385,12 @@ struct llama_context * llama_new_context_with_model(
12990
14385
 
12991
14386
  // graph outputs buffer
12992
14387
  {
12993
- // resized during inference, reserve maximum
12994
- ctx->logits_size = hparams.n_vocab*cparams.n_batch;
12995
- ctx->embd_size = params.embeddings ? hparams.n_embd*cparams.n_batch : 0;
12996
-
12997
- const size_t buf_output_size = (ctx->logits_size + ctx->embd_size)*sizeof(float);
12998
-
12999
- ctx->buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_output_size);
13000
- if (ctx->buf_output == nullptr) {
13001
- LLAMA_LOG_ERROR("%s: failed to allocate logits buffer\n", __func__);
14388
+ // resized during inference when a batch uses more outputs
14389
+ if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
14390
+ LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
13002
14391
  llama_free(ctx);
13003
14392
  return nullptr;
13004
14393
  }
13005
- ggml_backend_buffer_clear(ctx->buf_output, 0);
13006
-
13007
-
13008
- ctx->logits = (float *) ggml_backend_buffer_get_base(ctx->buf_output);
13009
- if (params.embeddings) {
13010
- ctx->embd = ctx->logits + ctx->logits_size;
13011
- }
13012
14394
 
13013
14395
  LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
13014
14396
  ggml_backend_buffer_name(ctx->buf_output),
@@ -13033,7 +14415,7 @@ struct llama_context * llama_new_context_with_model(
13033
14415
 
13034
14416
  // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
13035
14417
  bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
13036
- #ifndef GGML_USE_CUBLAS
14418
+ #ifndef GGML_USE_CUDA
13037
14419
  // pipeline parallelism requires support for async compute and events
13038
14420
  // currently this is only implemented in the CUDA backend
13039
14421
  pipeline_parallel = false;
@@ -13061,14 +14443,17 @@ struct llama_context * llama_new_context_with_model(
13061
14443
  ggml_backend_t backend = ctx->backends[i];
13062
14444
  ggml_backend_buffer_type_t buft = backend_buft[i];
13063
14445
  size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
13064
- LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
13065
- ggml_backend_buft_name(buft),
13066
- size / 1024.0 / 1024.0);
14446
+ if (size > 1) {
14447
+ LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
14448
+ ggml_backend_buft_name(buft),
14449
+ size / 1024.0 / 1024.0);
14450
+ }
13067
14451
  }
13068
14452
 
13069
14453
  // note: the number of splits during measure is higher than during inference due to the kv shift
13070
14454
  int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
13071
- LLAMA_LOG_INFO("%s: graph splits: %d\n", __func__, n_splits);
14455
+ LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
14456
+ LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
13072
14457
  }
13073
14458
  }
13074
14459
 
@@ -13138,10 +14523,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
13138
14523
  case LLM_ARCH_ORION:
13139
14524
  case LLM_ARCH_INTERNLM2:
13140
14525
  case LLM_ARCH_MINICPM:
14526
+ case LLM_ARCH_XVERSE:
14527
+ case LLM_ARCH_COMMAND_R:
13141
14528
  return LLAMA_ROPE_TYPE_NORM;
13142
14529
 
13143
14530
  // the pairs of head values are offset by n_rot/2
13144
14531
  case LLM_ARCH_FALCON:
14532
+ case LLM_ARCH_GROK:
13145
14533
  case LLM_ARCH_PERSIMMON:
13146
14534
  case LLM_ARCH_BERT:
13147
14535
  case LLM_ARCH_NOMIC_BERT:
@@ -13174,6 +14562,10 @@ int32_t llama_n_embd(const struct llama_model * model) {
13174
14562
  return model->hparams.n_embd;
13175
14563
  }
13176
14564
 
14565
+ int32_t llama_n_layer(const struct llama_model * model) {
14566
+ return model->hparams.n_layer;
14567
+ }
14568
+
13177
14569
  float llama_rope_freq_scale_train(const struct llama_model * model) {
13178
14570
  return model->hparams.rope_freq_scale_train;
13179
14571
  }
@@ -13273,6 +14665,96 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
13273
14665
  }
13274
14666
  }
13275
14667
 
14668
+ static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
14669
+ GGML_ASSERT(cvec.tensors.empty());
14670
+ GGML_ASSERT(cvec.ctxs.empty());
14671
+ GGML_ASSERT(cvec.bufs.empty());
14672
+
14673
+ // count layer buffer types
14674
+ std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
14675
+ for (int64_t i = 0; i < model.hparams.n_layer; i++) {
14676
+ buft_layer_count[model.buft_layer[i].buft]++;
14677
+ }
14678
+
14679
+ // allocate contexts
14680
+ std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
14681
+ for (auto & it : buft_layer_count) {
14682
+ int n_layers = it.second;
14683
+ struct ggml_init_params params = {
14684
+ /*.mem_size =*/ n_layers * ggml_tensor_overhead(),
14685
+ /*.mem_buffer =*/ NULL,
14686
+ /*.no_alloc =*/ true,
14687
+ };
14688
+ ggml_context * ctx = ggml_init(params);
14689
+ if (!ctx) {
14690
+ LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
14691
+ return 1;
14692
+ }
14693
+ ctx_map[it.first] = ctx;
14694
+ }
14695
+
14696
+ // make tensors
14697
+ cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
14698
+ for (size_t il = 1; il < model.hparams.n_layer; il++) {
14699
+ struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
14700
+ ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
14701
+ cvec.tensors.push_back(tensor);
14702
+ }
14703
+
14704
+ // allocate tensors / buffers and zero
14705
+ for (auto it : ctx_map) {
14706
+ ggml_backend_buffer_type_t buft = it.first;
14707
+ ggml_context * ctx = it.second;
14708
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
14709
+ if (!buf) {
14710
+ LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
14711
+ return false;
14712
+ }
14713
+ ggml_backend_buffer_clear(buf, 0);
14714
+ cvec.ctxs.push_back(ctx);
14715
+ cvec.bufs.push_back(buf);
14716
+ }
14717
+
14718
+ return true;
14719
+ }
14720
+
14721
+ int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
14722
+ const llama_model & model = lctx->model;
14723
+ llama_control_vector & cvec = lctx->cvec;
14724
+
14725
+ if (data == nullptr) {
14726
+ // disable the current control vector (but leave allocated for later)
14727
+ cvec.layer_start = -1;
14728
+ cvec.layer_end = -1;
14729
+ return 0;
14730
+ }
14731
+
14732
+ if (n_embd != (int) model.hparams.n_embd) {
14733
+ LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
14734
+ return 1;
14735
+ }
14736
+
14737
+ if (cvec.tensors.empty()) {
14738
+ if (!llama_control_vector_init(cvec, model)) {
14739
+ return 1;
14740
+ }
14741
+ }
14742
+
14743
+ cvec.layer_start = il_start;
14744
+ cvec.layer_end = il_end;
14745
+
14746
+ for (size_t il = 1; il < model.hparams.n_layer; il++) {
14747
+ assert(cvec.tensors[il] != nullptr);
14748
+
14749
+ const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
14750
+ if (off + n_embd <= len) {
14751
+ ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
14752
+ }
14753
+ }
14754
+
14755
+ return 0;
14756
+ }
14757
+
13276
14758
  struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
13277
14759
  struct llama_kv_cache_view result = {
13278
14760
  /*.n_cells = */ 0,
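
llama_control_vector_apply above takes a flat float buffer with one n_embd-sized row per layer, skipping layer 0, and applies it to layers in [il_start, il_end]. A hedged usage sketch, with zeros standing in for a real control vector loaded from disk:

    #include "llama.h"
    #include <vector>

    // sketch: apply a control vector to every layer except layer 0
    static int32_t apply_direction(llama_context * ctx) {
        const llama_model * model = llama_get_model(ctx);
        const int32_t n_embd  = llama_n_embd(model);
        const int32_t n_layer = llama_n_layer(model);

        // layer 0 never has an entry, so the buffer holds (n_layer - 1) rows of n_embd floats
        std::vector<float> data((size_t) n_embd * (n_layer - 1), 0.0f);  // placeholder values

        // passing data == nullptr instead would disable a previously applied vector
        return llama_control_vector_apply(ctx, data.data(), data.size(),
                                          n_embd, /*il_start*/ 1, /*il_end*/ n_layer - 1);
    }
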
@@ -13426,27 +14908,33 @@ void llama_kv_cache_update(struct llama_context * ctx) {
13426
14908
 
13427
14909
  // Returns the *maximum* size of the state
13428
14910
  size_t llama_get_state_size(const struct llama_context * ctx) {
14911
+ const auto & cparams = ctx->cparams;
14912
+ const auto & hparams = ctx->model.hparams;
14913
+
13429
14914
  // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
13430
14915
  // for reference, std::mt19937(1337) serializes to 6701 bytes.
13431
14916
  const size_t s_rng_size = sizeof(size_t);
13432
14917
  const size_t s_rng = LLAMA_MAX_RNG_STATE;
14918
+ const size_t s_n_outputs = sizeof(size_t);
14919
+ // assume worst case for outputs although only currently set ones are serialized
14920
+ const size_t s_output_pos = ctx->cparams.n_batch * sizeof(int32_t);
13433
14921
  const size_t s_logits_size = sizeof(size_t);
13434
- // assume worst case for logits although only currently set ones are serialized
13435
- const size_t s_logits = ctx->logits_size * sizeof(float);
14922
+ const size_t s_logits = ctx->logits_size ? cparams.n_batch * hparams.n_vocab * sizeof(float) : 0;
13436
14923
  const size_t s_embedding_size = sizeof(size_t);
13437
- const size_t s_embedding = ctx->embd_size * sizeof(float);
14924
+ const size_t s_embedding = ctx->embd_size ? cparams.n_batch * hparams.n_embd * sizeof(float) : 0;
13438
14925
  const size_t s_kv_buf_size = sizeof(size_t);
13439
14926
  const size_t s_kv_head = sizeof(uint32_t);
13440
14927
  const size_t s_kv_size = sizeof(uint32_t);
13441
14928
  const size_t s_kv_used = sizeof(uint32_t);
13442
14929
  const size_t s_kv = ctx->kv_self.total_size();
13443
- // TODO: assume the max is more than 1 seq_id per KV cell
13444
- const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
14930
+ const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
13445
14931
  const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
13446
14932
 
13447
14933
  const size_t s_total = (
13448
14934
  + s_rng_size
13449
14935
  + s_rng
14936
+ + s_n_outputs
14937
+ + s_output_pos
13450
14938
  + s_logits_size
13451
14939
  + s_logits
13452
14940
  + s_embedding_size
@@ -13521,7 +15009,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13521
15009
  std::ostringstream rng_ss;
13522
15010
  rng_ss << ctx->rng;
13523
15011
 
13524
- const std::string & rng_str = rng_ss.str();
15012
+ const std::string & rng_str = rng_ss.str();
13525
15013
  const size_t rng_size = rng_str.size();
13526
15014
 
13527
15015
  GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
@@ -13530,25 +15018,61 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13530
15018
  data_ctx->write(rng_str.data(), rng_size);
13531
15019
  }
13532
15020
 
13533
- // copy logits
15021
+ // copy outputs
13534
15022
  {
13535
- const size_t logits_size = ctx->logits_size;
15023
+ // Can't use ctx->n_outputs because it's not for the
15024
+ // entire last batch when n_ubatch is smaller than n_batch
15025
+ size_t n_outputs = 0;
13536
15026
 
13537
- data_ctx->write(&logits_size, sizeof(logits_size));
15027
+ // copy output ids
15028
+ {
15029
+ std::vector<int32_t> output_pos;
13538
15030
 
13539
- if (logits_size) {
13540
- data_ctx->write(ctx->logits, logits_size * sizeof(float));
15031
+ const size_t n_batch = ctx->cparams.n_batch;
15032
+ const auto & output_ids = ctx->output_ids;
15033
+
15034
+ output_pos.resize(ctx->output_size);
15035
+
15036
+ // build a more compact representation of the output ids
15037
+ for (size_t i = 0; i < n_batch; ++i) {
15038
+ // map an output id to a position in the batch
15039
+ int32_t pos = output_ids[i];
15040
+ if (pos >= 0) {
15041
+ if ((size_t) pos >= n_outputs) {
15042
+ n_outputs = pos + 1;
15043
+ }
15044
+ GGML_ASSERT((size_t) pos < ctx->output_size);
15045
+ output_pos[pos] = i;
15046
+ }
15047
+ }
15048
+
15049
+ data_ctx->write(&n_outputs, sizeof(n_outputs));
15050
+
15051
+ if (n_outputs) {
15052
+ data_ctx->write(output_pos.data(), n_outputs * sizeof(int32_t));
15053
+ }
13541
15054
  }
13542
- }
13543
15055
 
13544
- // copy embeddings
13545
- {
13546
- const size_t embeddings_size = ctx->embd_size;
15056
+ // copy logits
15057
+ {
15058
+ const size_t logits_size = std::min(ctx->logits_size, n_outputs * ctx->model.hparams.n_vocab);
13547
15059
 
13548
- data_ctx->write(&embeddings_size, sizeof(embeddings_size));
15060
+ data_ctx->write(&logits_size, sizeof(logits_size));
13549
15061
 
13550
- if (embeddings_size) {
13551
- data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
15062
+ if (logits_size) {
15063
+ data_ctx->write(ctx->logits, logits_size * sizeof(float));
15064
+ }
15065
+ }
15066
+
15067
+ // copy embeddings
15068
+ {
15069
+ const size_t embeddings_size = std::min(ctx->embd_size, n_outputs * ctx->model.hparams.n_embd);
15070
+
15071
+ data_ctx->write(&embeddings_size, sizeof(embeddings_size));
15072
+
15073
+ if (embeddings_size) {
15074
+ data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
15075
+ }
13552
15076
  }
13553
15077
  }
13554
15078
 
@@ -13561,9 +15085,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13561
15085
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
13562
15086
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
13563
15087
 
13564
- const size_t kv_buf_size = kv_self.total_size();
15088
+ // NOTE: kv_size and kv_buf_size are mostly used for sanity checks
13565
15089
  const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
13566
15090
  const uint32_t kv_size = kv_self.size;
15091
+ const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
13567
15092
  const uint32_t kv_used = kv_self.used;
13568
15093
 
13569
15094
  data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
@@ -13572,6 +15097,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13572
15097
  data_ctx->write(&kv_used, sizeof(kv_used));
13573
15098
 
13574
15099
  if (kv_buf_size) {
15100
+ const size_t pre_kv_buf_size = data_ctx->get_size_written();
15101
+
13575
15102
  std::vector<uint8_t> tmp_buf;
13576
15103
  for (int il = 0; il < (int) n_layer; ++il) {
13577
15104
  const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
@@ -13601,6 +15128,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13601
15128
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
13602
15129
  }
13603
15130
  }
15131
+ GGML_ASSERT(kv_buf_size == data_ctx->get_size_written() - pre_kv_buf_size);
13604
15132
  }
13605
15133
 
13606
15134
  for (uint32_t i = 0; i < kv_head; ++i) {
@@ -13645,6 +15173,28 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
13645
15173
  GGML_ASSERT(!rng_ss.fail());
13646
15174
  }
13647
15175
 
15176
+ // set output ids
15177
+ {
15178
+ size_t n_outputs;
15179
+ std::vector<int32_t> output_pos;
15180
+
15181
+ memcpy(&n_outputs, inp, sizeof(n_outputs)); inp += sizeof(n_outputs);
15182
+
15183
+ GGML_ASSERT(n_outputs <= llama_output_reserve(*ctx, n_outputs));
15184
+
15185
+ if (n_outputs) {
15186
+ output_pos.resize(n_outputs);
15187
+ memcpy(output_pos.data(), inp, n_outputs * sizeof(int32_t));
15188
+ inp += n_outputs * sizeof(int32_t);
15189
+
15190
+ for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
15191
+ int32_t id = output_pos[i];
15192
+ GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
15193
+ ctx->output_ids[id] = i;
15194
+ }
15195
+ }
15196
+ }
15197
+
13648
15198
  // set logits
13649
15199
  {
13650
15200
  size_t logits_size;
@@ -13665,7 +15215,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
13665
15215
 
13666
15216
  memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
13667
15217
 
13668
- GGML_ASSERT(ctx->embd_size == embeddings_size);
15218
+ GGML_ASSERT(ctx->embd_size >= embeddings_size);
13669
15219
 
13670
15220
  if (embeddings_size) {
13671
15221
  memcpy(ctx->embd, inp, embeddings_size * sizeof(float));
@@ -13692,8 +15242,18 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
13692
15242
  memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
13693
15243
  memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
13694
15244
 
15245
+ if (kv_self.size != kv_size) {
15246
+ // the KV cache needs to be big enough to load all the KV cells from the saved state
15247
+ GGML_ASSERT(kv_self.size >= kv_head);
15248
+
15249
+ LLAMA_LOG_INFO("%s: state contains %d KV cells, was saved with kv_size=%d, but is loaded with kv_size=%d (fine, but different)\n",
15250
+ __func__, kv_head, kv_size, kv_self.size);
15251
+ }
15252
+
13695
15253
  if (kv_buf_size) {
13696
- GGML_ASSERT(kv_self.total_size() == kv_buf_size);
15254
+ const size_t pre_kv_buf_size = inp - src;
15255
+
15256
+ GGML_ASSERT(kv_self.total_size() >= kv_buf_size);
13697
15257
 
13698
15258
  for (int il = 0; il < (int) n_layer; ++il) {
13699
15259
  const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
@@ -13713,23 +15273,21 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
13713
15273
 
13714
15274
  // v is not contiguous, copy row by row
13715
15275
  const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
13716
- const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
15276
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_self.size);
13717
15277
 
13718
15278
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
13719
15279
  ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
13720
15280
  inp += v_row_size;
13721
15281
  }
13722
15282
  }
15283
+ GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
13723
15284
  }
13724
15285
 
13725
- GGML_ASSERT(kv_self.size == kv_size);
15286
+ llama_kv_cache_clear(ctx);
13726
15287
 
13727
15288
  ctx->kv_self.head = kv_head;
13728
- ctx->kv_self.size = kv_size;
13729
15289
  ctx->kv_self.used = kv_used;
13730
15290
 
13731
- ctx->kv_self.cells.resize(kv_size);
13732
-
13733
15291
  for (uint32_t i = 0; i < kv_head; ++i) {
13734
15292
  llama_pos pos;
13735
15293
  size_t seq_id_size;
@@ -13746,11 +15304,6 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
13746
15304
  ctx->kv_self.cells[i].seq_id.insert(seq_id);
13747
15305
  }
13748
15306
  }
13749
-
13750
- for (uint32_t i = kv_head; i < kv_size; ++i) {
13751
- ctx->kv_self.cells[i].pos = -1;
13752
- ctx->kv_self.cells[i].seq_id.clear();
13753
- }
13754
15307
  }
13755
15308
 
13756
15309
  const size_t nread = inp - src;
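On the restore side the cache is now cleared with llama_kv_cache_clear and repopulated up to kv_head, and a state saved with a different kv_size is accepted as long as the current cache can hold kv_head cells. A caller-side sketch, assuming blob was produced by llama_copy_state_data on a compatible model (load_state is a hypothetical helper):

    #include <cstdint>
    #include <vector>
    #include "llama.h"

    // Restore a previously saved state; llama_set_state_data returns the
    // number of bytes it consumed (nread above), which should match the blob.
    static bool load_state(llama_context * ctx, const std::vector<uint8_t> & blob) {
        const size_t n_read = llama_set_state_data(ctx, blob.data());
        return n_read == blob.size();
    }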
@@ -13956,11 +15509,33 @@ float * llama_get_logits(struct llama_context * ctx) {
13956
15509
  }
13957
15510
 
13958
15511
  float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
13959
- assert(ctx->logits_valid.at(i));
13960
-
13961
15512
  llama_synchronize(ctx);
13962
15513
 
13963
- return ctx->logits + i*ctx->model.hparams.n_vocab;
15514
+ try {
15515
+ if (ctx->logits == nullptr) {
15516
+ throw std::runtime_error("no logits");
15517
+ }
15518
+ if ((size_t) i >= ctx->output_ids.size()) {
15519
+ throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
15520
+ }
15521
+ const int32_t j = ctx->output_ids[i];
15522
+
15523
+ if (j < 0) {
15524
+ throw std::runtime_error(format("batch.logits[%d] != true", i));
15525
+ }
15526
+ if ((size_t) j >= ctx->output_size) {
15527
+ // This should not happen
15528
+ throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
15529
+ }
15530
+
15531
+ return ctx->logits + j*ctx->model.hparams.n_vocab;
15532
+ } catch (const std::exception & err) {
15533
+ LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
15534
+ #ifndef NDEBUG
15535
+ GGML_ASSERT(false);
15536
+ #endif
15537
+ return nullptr;
15538
+ }
13964
15539
  }
13965
15540
 
13966
15541
  float * llama_get_embeddings(struct llama_context * ctx) {
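llama_get_logits_ith now maps the batch position through ctx->output_ids and, in release builds, logs the reason and returns nullptr instead of asserting. A caller-side sketch under that assumption; greedy_token_at is a hypothetical helper, not library code:

    #include "llama.h"

    // Return the argmax token at batch position i, or -1 if no logits are
    // available for that position (reason is written to the llama log).
    static llama_token greedy_token_at(llama_context * ctx, int32_t i) {
        float * logits = llama_get_logits_ith(ctx, i);
        if (logits == nullptr) {
            // position i was not marked for output (batch.logits[i] == false)
            // or i was out of range
            return -1;
        }
        const int n_vocab = llama_n_vocab(llama_get_model(ctx));
        int best = 0;
        for (int t = 1; t < n_vocab; ++t) {
            if (logits[t] > logits[best]) {
                best = t;
            }
        }
        return best;
    }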
@@ -13972,7 +15547,31 @@ float * llama_get_embeddings(struct llama_context * ctx) {
13972
15547
  float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
13973
15548
  llama_synchronize(ctx);
13974
15549
 
13975
- return ctx->embd + i*ctx->model.hparams.n_embd;
15550
+ try {
15551
+ if (ctx->embd == nullptr) {
15552
+ throw std::runtime_error("no embeddings");
15553
+ }
15554
+ if ((size_t) i >= ctx->output_ids.size()) {
15555
+ throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
15556
+ }
15557
+ const int32_t j = ctx->output_ids[i];
15558
+
15559
+ if (j < 0) {
15560
+ throw std::runtime_error(format("batch.logits[%d] != true", i));
15561
+ }
15562
+ if ((size_t) j >= ctx->output_size) {
15563
+ // This should not happen
15564
+ throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
15565
+ }
15566
+
15567
+ return ctx->embd + j*ctx->model.hparams.n_embd;
15568
+ } catch (const std::exception & err) {
15569
+ LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
15570
+ #ifndef NDEBUG
15571
+ GGML_ASSERT(false);
15572
+ #endif
15573
+ return nullptr;
15574
+ }
13976
15575
  }
13977
15576
 
13978
15577
  float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
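llama_get_embeddings_ith gets the same treatment, so callers should check for a null return here as well. A short sketch with a hypothetical helper, assuming the context was created with embeddings enabled:

    #include <vector>
    #include "llama.h"

    // Copy the embedding vector for batch position i; returns false if the
    // position has no output (mirrors llama_get_logits_ith above).
    static bool copy_embedding_at(llama_context * ctx, int32_t i, std::vector<float> & out) {
        float * embd = llama_get_embeddings_ith(ctx, i);
        if (embd == nullptr) {
            return false;
        }
        out.assign(embd, embd + llama_n_embd(llama_get_model(ctx)));
        return true;
    }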
@@ -14262,6 +15861,55 @@ static int32_t llama_chat_apply_template_internal(
14262
15861
  ss << message->content << "</s>";
14263
15862
  }
14264
15863
  }
15864
+ } else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) {
15865
+ // openchat/openchat-3.5-0106,
15866
+ for (auto message : chat) {
15867
+ std::string role(message->role);
15868
+ if (role == "system") {
15869
+ ss << message->content << "<|end_of_turn|>";
15870
+ } else {
15871
+ role[0] = toupper(role[0]);
15872
+ ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
15873
+ }
15874
+ }
15875
+ if (add_ass) {
15876
+ ss << "GPT4 Correct Assistant:";
15877
+ }
15878
+ } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) {
15879
+ // eachadea/vicuna-13b-1.1 (and Orca variant)
15880
+ for (auto message : chat) {
15881
+ std::string role(message->role);
15882
+ if (role == "system") {
15883
+ // Orca-Vicuna variant uses a system prefix
15884
+ if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) {
15885
+ ss << "SYSTEM: " << message->content << "\n";
15886
+ } else {
15887
+ ss << message->content << "\n\n";
15888
+ }
15889
+ } else if (role == "user") {
15890
+ ss << "USER: " << message->content << "\n";
15891
+ } else if (role == "assistant") {
15892
+ ss << "ASSISTANT: " << message->content << "</s>\n";
15893
+ }
15894
+ }
15895
+ if (add_ass) {
15896
+ ss << "ASSISTANT:";
15897
+ }
15898
+ } else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) {
15899
+ // deepseek-ai/deepseek-coder-33b-instruct
15900
+ for (auto message : chat) {
15901
+ std::string role(message->role);
15902
+ if (role == "system") {
15903
+ ss << message->content;
15904
+ } else if (role == "user") {
15905
+ ss << "### Instruction:\n" << message->content << "\n";
15906
+ } else if (role == "assistant") {
15907
+ ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
15908
+ }
15909
+ }
15910
+ if (add_ass) {
15911
+ ss << "### Response:\n";
15912
+ }
14265
15913
  } else {
14266
15914
  // template not supported
14267
15915
  return -1;
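The three templates added above ("openchat", "vicuna"/"vicuna-orca" and "deepseek") are selected either by explicit name or by sniffing marker strings in the template text. A sketch of requesting one by name through llama_chat_apply_template, assuming the public signature from llama.h in this release; the buffer size is arbitrary and no model handle is needed when the name is passed explicitly:

    #include <string>
    #include <vector>
    #include "llama.h"

    // Format a short conversation with the "openchat" template; "vicuna",
    // "vicuna-orca" and "deepseek" are used the same way.
    static std::string format_openchat() {
        const std::vector<llama_chat_message> chat = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!"                       },
        };
        std::vector<char> buf(4096);
        const int32_t n = llama_chat_apply_template(
            /*model   =*/ nullptr, "openchat", chat.data(), chat.size(),
            /*add_ass =*/ true, buf.data(), (int32_t) buf.size());
        if (n < 0 || n > (int32_t) buf.size()) {
            return ""; // unsupported template, or the buffer was too small
        }
        return std::string(buf.data(), n);
    }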
@@ -14311,6 +15959,30 @@ LLAMA_API int32_t llama_chat_apply_template(
14311
15959
  return res;
14312
15960
  }
14313
15961
 
15962
+ LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
15963
+ static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
15964
+ if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
15965
+ return strlen(split_path);
15966
+ }
15967
+ return 0;
15968
+ }
15969
+
15970
+ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
15971
+ std::string str_split_path(split_path);
15972
+ char postfix[32];
15973
+ snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
15974
+ std::string str_postfix(postfix);
15975
+
15976
+ // check if dest ends with postfix
15977
+ int size_prefix = str_split_path.size() - str_postfix.size();
15978
+ if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
15979
+ snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
15980
+ return size_prefix;
15981
+ }
15982
+
15983
+ return 0;
15984
+ }
15985
+
14314
15986
  struct llama_timings llama_get_timings(struct llama_context * ctx) {
14315
15987
  struct llama_timings result = {
14316
15988
  /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,