llama_cpp 0.14.2 → 0.14.4

@@ -7,7 +7,7 @@
  #include "ggml-alloc.h"
  #include "ggml-backend.h"

- #ifdef GGML_USE_CUBLAS
+ #ifdef GGML_USE_CUDA
  # include "ggml-cuda.h"
  #elif defined(GGML_USE_CLBLAST)
  # include "ggml-opencl.h"
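
[Note] The CUDA build guard is renamed from GGML_USE_CUBLAS to GGML_USE_CUDA throughout this file. Downstream code that is still compiled with the old -DGGML_USE_CUBLAS flag would silently lose the CUDA paths; a minimal bridging shim like the sketch below (an assumption for illustration, not part of this diff or of upstream ggml) keeps such builds working until their flags are updated.

    // hypothetical bridge for out-of-tree builds still passing -DGGML_USE_CUBLAS
    #if defined(GGML_USE_CUBLAS) && !defined(GGML_USE_CUDA)
    #define GGML_USE_CUDA 1
    #endif
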
@@ -52,12 +52,16 @@
  #define NOMINMAX
  #endif
  #include <windows.h>
+ #ifndef PATH_MAX
+ #define PATH_MAX MAX_PATH
+ #endif
  #include <io.h>
  #endif

  #include <algorithm>
  #include <array>
  #include <cassert>
+ #include <cctype>
  #include <cfloat>
  #include <cinttypes>
  #include <climits>
@@ -68,7 +72,6 @@
  #include <cstdio>
  #include <cstring>
  #include <ctime>
- #include <cwctype>
  #include <forward_list>
  #include <fstream>
  #include <functional>
@@ -192,6 +195,7 @@ enum llm_arch {
  LLM_ARCH_LLAMA,
  LLM_ARCH_FALCON,
  LLM_ARCH_BAICHUAN,
+ LLM_ARCH_GROK,
  LLM_ARCH_GPT2,
  LLM_ARCH_GPTJ,
  LLM_ARCH_GPTNEOX,
@@ -214,12 +218,15 @@ enum llm_arch {
  LLM_ARCH_GEMMA,
  LLM_ARCH_STARCODER2,
  LLM_ARCH_MAMBA,
+ LLM_ARCH_XVERSE,
+ LLM_ARCH_COMMAND_R,
  LLM_ARCH_UNKNOWN,
  };

  static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_LLAMA, "llama" },
  { LLM_ARCH_FALCON, "falcon" },
+ { LLM_ARCH_GROK, "grok" },
  { LLM_ARCH_GPT2, "gpt2" },
  { LLM_ARCH_GPTJ, "gptj" },
  { LLM_ARCH_GPTNEOX, "gptneox" },
@@ -243,6 +250,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_GEMMA, "gemma" },
  { LLM_ARCH_STARCODER2, "starcoder2" },
  { LLM_ARCH_MAMBA, "mamba" },
+ { LLM_ARCH_XVERSE, "xverse" },
+ { LLM_ARCH_COMMAND_R, "command-r" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

@@ -268,6 +277,7 @@ enum llm_kv {
  LLM_KV_EXPERT_COUNT,
  LLM_KV_EXPERT_USED_COUNT,
  LLM_KV_POOLING_TYPE,
+ LLM_KV_LOGIT_SCALE,

  LLM_KV_ATTENTION_HEAD_COUNT,
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -287,6 +297,10 @@ enum llm_kv {
  LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
  LLM_KV_ROPE_SCALING_FINETUNED,

+ LLM_KV_SPLIT_NO,
+ LLM_KV_SPLIT_COUNT,
+ LLM_KV_SPLIT_TENSORS_COUNT,
+
  LLM_KV_SSM_INNER_SIZE,
  LLM_KV_SSM_CONV_KERNEL,
  LLM_KV_SSM_STATE_SIZE,
@@ -332,6 +346,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
  { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
  { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
+ { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },

  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -351,6 +366,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
  { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },

+ { LLM_KV_SPLIT_NO, "split.no" },
+ { LLM_KV_SPLIT_COUNT, "split.count" },
+ { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
+
  { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
  { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
  { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
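
[Note] The new split.no, split.count and split.tensors.count keys describe sharded (multi-file) GGUF models. A minimal sketch of reading them through the gguf API is shown below; the choice of u16 accessors and the error handling are assumptions for illustration, not a copy of upstream code.

    // sketch: query split metadata from an already-opened gguf_context * meta
    uint16_t split_no    = 0;
    uint16_t split_count = 0;
    {
        const int kid_no    = gguf_find_key(meta, "split.no");
        const int kid_count = gguf_find_key(meta, "split.count");
        if (kid_no >= 0 && kid_count >= 0) {
            split_no    = gguf_get_val_u16(meta, kid_no);
            split_count = gguf_get_val_u16(meta, kid_count);
        }
    }
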
@@ -407,9 +426,12 @@ enum llm_tensor {
  LLM_TENSOR_FFN_DOWN,
  LLM_TENSOR_FFN_UP,
  LLM_TENSOR_FFN_ACT,
- LLM_TENSOR_FFN_DOWN_EXP,
+ LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
  LLM_TENSOR_FFN_GATE_EXP,
  LLM_TENSOR_FFN_UP_EXP,
+ LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
  LLM_TENSOR_ATTN_Q_NORM,
  LLM_TENSOR_ATTN_K_NORM,
  LLM_TENSOR_LAYER_OUT_NORM,
@@ -444,6 +466,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
  { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
  { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
  },
  },
  {
@@ -479,6 +504,31 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_GROK,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+ },
+ },
  {
  LLM_ARCH_GPT2,
  {
@@ -536,6 +586,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  {
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output"},
  { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
@@ -543,6 +594,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
+ { LLM_TENSOR_POS_EMBD, "position_embd" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
  },
  },
  {
@@ -838,6 +892,40 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
  },
  },
+ {
+ LLM_ARCH_XVERSE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+ {
+ LLM_ARCH_COMMAND_R,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1010,7 +1098,7 @@ struct llama_file {
  size_t size;

  llama_file(const char * fname, const char * mode) {
- fp = std::fopen(fname, mode);
+ fp = ggml_fopen(fname, mode);
  if (fp == NULL) {
  throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
  }
@@ -1079,6 +1167,7 @@ struct llama_file {
  }
  }
  };
+ using llama_files = std::vector<std::unique_ptr<llama_file>>;

  struct llama_mmap {
  void * addr;
@@ -1279,6 +1368,7 @@ struct llama_mmap {
  }
  #endif
  };
+ using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;

  // Represents some region of memory being locked using mlock or VirtualLock;
  // will automatically unlock on destruction.
@@ -1428,6 +1518,7 @@ struct llama_mlock {
  static void raw_unlock(const void * addr, size_t len) {}
  #endif
  };
+ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

  static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
  std::vector<char> result(8, 0);
@@ -1447,7 +1538,7 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
  static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
  ggml_backend_buffer_type_t buft = nullptr;

- #if defined(GGML_USE_CUBLAS)
+ #if defined(GGML_USE_CUDA)
  // host buffers should only be used when data is expected to be copied to/from the GPU
  if (host_buffer) {
  buft = ggml_backend_cuda_host_buffer_type();
@@ -1477,7 +1568,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {

  #ifdef GGML_USE_METAL
  buft = ggml_backend_metal_buffer_type();
- #elif defined(GGML_USE_CUBLAS)
+ #elif defined(GGML_USE_CUDA)
  buft = ggml_backend_cuda_buffer_type(gpu);
  #elif defined(GGML_USE_VULKAN)
  buft = ggml_backend_vk_buffer_type(gpu);
@@ -1503,7 +1594,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
  static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
  ggml_backend_buffer_type_t buft = nullptr;

- #ifdef GGML_USE_CUBLAS
+ #ifdef GGML_USE_CUDA
  if (ggml_backend_cuda_get_device_count() > 1) {
  buft = ggml_backend_cuda_split_buffer_type(tensor_split);
  }
@@ -1524,7 +1615,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
  }

  static size_t llama_get_device_count() {
- #if defined(GGML_USE_CUBLAS)
+ #if defined(GGML_USE_CUDA)
  return ggml_backend_cuda_get_device_count();
  #elif defined(GGML_USE_SYCL)
  return ggml_backend_sycl_get_device_count();
@@ -1536,7 +1627,7 @@ static size_t llama_get_device_count() {
  }

  static size_t llama_get_device_memory(int device) {
- #if defined(GGML_USE_CUBLAS)
+ #if defined(GGML_USE_CUDA)
  size_t total;
  size_t free;
  ggml_backend_cuda_get_device_memory(device, &total, &free);
@@ -1597,9 +1688,11 @@ enum e_model {
  MODEL_20B,
  MODEL_30B,
  MODEL_34B,
+ MODEL_35B,
  MODEL_40B,
  MODEL_65B,
  MODEL_70B,
+ MODEL_314B,
  MODEL_SMALL,
  MODEL_MEDIUM,
  MODEL_LARGE,
@@ -1643,6 +1736,7 @@ struct llama_hparams {

  float f_clamp_kqv = 0.0f;
  float f_max_alibi_bias = 0.0f;
+ float f_logit_scale = 0.0f;

  bool causal_attn = true;
  bool need_kq_pos = false;
@@ -1716,6 +1810,7 @@ struct llama_cparams {
  uint32_t n_ctx; // context size used during inference
  uint32_t n_batch;
  uint32_t n_ubatch;
+ uint32_t n_seq_max;
  uint32_t n_threads; // number of threads to use for generation
  uint32_t n_threads_batch; // number of threads to use for batch processing

@@ -1781,9 +1876,9 @@ struct llama_layer {

  // ff MoE
  struct ggml_tensor * ffn_gate_inp;
- struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
- struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
- struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
+ struct ggml_tensor * ffn_gate_exps;
+ struct ggml_tensor * ffn_down_exps;
+ struct ggml_tensor * ffn_up_exps ;

  // ff bias
  struct ggml_tensor * ffn_down_b; // b2
@@ -1873,6 +1968,31 @@ struct llama_kv_cache {
  }
  };

+ struct llama_control_vector {
+ std::vector<struct ggml_tensor *> tensors; // per layer
+ std::vector<struct ggml_context *> ctxs;
+ std::vector<ggml_backend_buffer_t> bufs;
+
+ int32_t layer_start = -1;
+ int32_t layer_end = -1;
+
+ ggml_tensor * tensor_for(int il) const {
+ if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
+ return nullptr;
+ }
+ return tensors[il];
+ }
+
+ ~llama_control_vector() {
+ for (struct ggml_context * ctx : ctxs) {
+ ggml_free(ctx);
+ }
+ for (ggml_backend_buffer_t buf : bufs) {
+ ggml_backend_buffer_free(buf);
+ }
+ }
+ };
+
  struct llama_vocab {
  using id = int32_t;
  using token = std::string;
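
[Note] llama_control_vector keeps one optional tensor per layer plus an active [layer_start, layer_end] window; tensor_for(il) returns the tensor to apply at layer il, or nullptr. A minimal usage sketch is shown below; the helper name apply_cvec is made up for illustration, only tensor_for and the ggml_add call pattern come from the source.

    // sketch: bias the hidden state of layer il with its control vector, if any
    ggml_tensor * apply_cvec(ggml_context * ctx0, const llama_control_vector & cvec,
                             ggml_tensor * cur, int il) {
        ggml_tensor * layer_dir = cvec.tensor_for(il);
        if (layer_dir != nullptr) {
            cur = ggml_add(ctx0, cur, layer_dir); // add the per-layer direction to the residual stream
        }
        return cur;
    }
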
@@ -1976,12 +2096,12 @@ struct llama_model {
  // the model memory buffers for the tensor data
  std::vector<ggml_backend_buffer_t> bufs;

- // model memory mapped file
- std::unique_ptr<llama_mmap> mapping;
+ // model memory mapped files
+ llama_mmaps mappings;

  // objects representing data potentially being locked in memory
- std::vector<std::unique_ptr<llama_mlock>> mlock_bufs;
- llama_mlock mlock_mmap;
+ llama_mlocks mlock_bufs;
+ llama_mlocks mlock_mmaps;

  // for quantize-stats only
  std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
@@ -1994,6 +2114,11 @@ struct llama_model {
  ggml_free(ctx);
  }
  for (ggml_backend_buffer_t buf : bufs) {
+ #ifdef GGML_USE_CUDA
+ if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
+ ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
+ }
+ #endif
  ggml_backend_buffer_free(buf);
  }
  }
@@ -2008,10 +2133,6 @@ struct llama_context {
  ggml_backend_free(backend);
  }

- #ifdef GGML_USE_VULKAN
- ggml_vk_free_cpu_assist();
- #endif
-
  ggml_backend_buffer_free(buf_output);
  }

@@ -2048,20 +2169,20 @@ struct llama_context {
  // host buffer for the model output (logits and embeddings)
  ggml_backend_buffer_t buf_output = nullptr;

- // decode output (2-dimensional array: [n_tokens][n_vocab])
- size_t logits_size = 0;
- float * logits = nullptr;
+ // decode output (2-dimensional array: [n_outputs][n_vocab])
+ size_t logits_size = 0; // capacity (of floats) for logits
+ float * logits = nullptr;
+
+ std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
+ size_t output_size = 0; // capacity (of tokens positions) for the output buffers
+ int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch

- #ifndef NDEBUG
- // guard against access to unset logits
- std::vector<bool> logits_valid;
- #endif
  bool logits_all = false;

- // embeddings output (2-dimensional array: [n_tokens][n_embd])
+ // embeddings output (2-dimensional array: [n_outputs][n_embd])
  // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
- size_t embd_size = 0;
- float * embd = nullptr;
+ size_t embd_size = 0; // capacity (of floats) for embeddings
+ float * embd = nullptr;

  // sequence embeddings output (map of [n_embd] vectors)
  // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
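
[Note] Logits and embeddings are now stored only for the tokens that actually request output, so a batch position has to be translated through output_ids before indexing the logits buffer. A simplified sketch of that lookup follows; the function name and the exact "no output" sentinel handling are assumptions, the fields (output_ids, n_outputs, logits) come from the struct above.

    // sketch: return the logits row kept for batch position i, or nullptr if none was stored
    float * logits_for_position(llama_context & ctx, int32_t i, int32_t n_vocab) {
        if (i < 0 || (size_t) i >= ctx.output_ids.size()) {
            return nullptr;
        }
        const int32_t row = ctx.output_ids[i]; // assumed to be negative when no output was requested
        if (row < 0 || row >= ctx.n_outputs) {
            return nullptr;
        }
        return ctx.logits + (size_t) row * n_vocab;
    }
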
@@ -2078,14 +2199,18 @@ struct llama_context {
  struct ggml_tensor * inp_tokens; // I32 [n_batch]
  struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
  struct ggml_tensor * inp_pos; // I32 [n_batch]
+ struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
  struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
- struct ggml_tensor * inp_KQ_pos; // F32 [kv_size]
+ struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
  struct ggml_tensor * inp_K_shift; // I32 [kv_size]
  struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
  struct ggml_tensor * inp_cls; // I32 [n_batch]
  struct ggml_tensor * inp_s_copy; // I32 [kv_size]
- struct ggml_tensor * inp_s_mask; // F32 [1, kv_size]
- struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch]
+ struct ggml_tensor * inp_s_mask; // F32 [1, n_kv]
+ struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
+
+ // control vectors
+ struct llama_control_vector cvec;

  #ifdef GGML_USE_MPI
  ggml_mpi_context * ctx_mpi = NULL;
@@ -2737,6 +2862,8 @@ namespace GGUFMeta {
  };
  }

+ using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
+
  struct llama_model_loader {
  int n_kv = 0;
  int n_tensors = 0;
@@ -2747,54 +2874,133 @@ struct llama_model_loader {

  bool use_mmap = false;

- llama_file file;
+ llama_files files;
  llama_ftype ftype;
  llama_fver fver;

- std::unique_ptr<llama_mmap> mapping;
+ llama_mmaps mappings;
+
+ // Holds information on a model weight
+ struct llama_tensor_weight {
+ uint16_t idx; // source file index
+ size_t offs; // tensor data offset in the original file
+
+ ggml_tensor * tensor;
+
+ llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+ const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
+ offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+ }
+ };
+ std::vector<llama_tensor_weight> weights;
+
  std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;

- struct gguf_context * ctx_gguf = NULL;
- struct ggml_context * ctx_meta = NULL;
+ struct gguf_context * meta = NULL;
+ std::vector<ggml_context *> contexts;

  std::string arch_name;
  LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

- llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
+ llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
  int trace = 0;
  if (getenv("LLAMA_TRACE")) {
  trace = atoi(getenv("LLAMA_TRACE"));
  }

- struct gguf_init_params params = {
- /*.no_alloc = */ true,
- /*.ctx = */ &ctx_meta,
- };
-
  if (param_overrides_p != nullptr) {
  for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
  kv_overrides.insert({std::string(p->key), *p});
  }
  }

- ctx_gguf = gguf_init_from_file(fname.c_str(), params);
- if (!ctx_gguf) {
+ struct ggml_context * ctx = NULL;
+ struct gguf_init_params params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ &ctx,
+ };
+
+ meta = gguf_init_from_file(fname.c_str(), params);
+ if (!meta) {
  throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
  }

  get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
  llm_kv = LLM_KV(llm_arch_from_string(arch_name));

- n_kv = gguf_get_n_kv(ctx_gguf);
- n_tensors = gguf_get_n_tensors(ctx_gguf);
+ // Save tensors data offset of the main file.
+ // For subsidiary files, `meta` tensor data offset must not be used,
+ // so we build a unified tensors index for weights.
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+ weights.emplace_back(0, cur->name, meta, cur);
+ }
+ files.emplace_back(new llama_file(fname.c_str(), "rb"));
+ contexts.emplace_back(ctx);
+
+ uint16_t n_split = 0;
+ get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
+
+ // Load additional GGML contexts
+ if (n_split > 1) {
+ uint16_t idx = 0;
+ get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
+ if (idx != 0) {
+ throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
+ }
+
+ char split_prefix[PATH_MAX] = {0};
+ if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) {
+ throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
+ }
+
+ if (trace > 0) {
+ LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
+ }
+
+ char split_path[PATH_MAX] = {0};
+ for (idx = 1; idx < n_split; idx++) {
+ llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
+
+ struct gguf_init_params split_params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ &ctx,
+ };
+ struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
+ if (!ctx_gguf) {
+ throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
+ }
+
+ // Save tensors data offset info of the shard.
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+ weights.emplace_back(idx, cur->name, ctx_gguf, cur);
+ }
+ files.emplace_back(new llama_file(split_path, "rb"));
+ contexts.emplace_back(ctx);
+
+ gguf_free(ctx_gguf);
+ }
+
+ get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
+
+ // sanity check
+ {
+ const int n_tensors_loaded = (int) weights.size();
+ if (n_tensors != n_tensors_loaded) {
+ throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
+ }
+ }

- fver = (enum llama_fver ) gguf_get_version(ctx_gguf);
+ LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
+ }
+
+ n_kv = gguf_get_n_kv(meta);
+ n_tensors = weights.size();

- for (int i = 0; i < n_tensors; i++) {
- const char * name = gguf_get_tensor_name(ctx_gguf, i);
- struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
- n_elements += ggml_nelements(t);
- n_bytes += ggml_nbytes(t);
+ fver = (enum llama_fver) gguf_get_version(meta);
+
+ for (auto & w : weights) {
+ n_elements += ggml_nelements(w.tensor);
+ n_bytes += ggml_nbytes(w.tensor);
  }

  LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -2809,7 +3015,8 @@ struct llama_model_loader {
  enum ggml_type type_max = GGML_TYPE_F32;

  for (int i = 0; i < n_tensors; i++) {
- enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);
+ const ggml_tensor * tensor = weights.at(i).tensor;
+ enum ggml_type type = tensor->type;

  n_type[type]++;

@@ -2819,8 +3026,8 @@ struct llama_model_loader {
  }

  if (trace > 0) {
- struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
- LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
+ const uint16_t sid = weights.at(i).idx;
+ LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
  }
  }

@@ -2842,6 +3049,7 @@ struct llama_model_loader {
  case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
  case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
  case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
+ case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break;
  case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
  case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
  case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
@@ -2856,22 +3064,23 @@ struct llama_model_loader {
  ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);

  {
- const int kid = gguf_find_key(ctx_gguf, "general.file_type");
+ const int kid = gguf_find_key(meta, "general.file_type");
  if (kid >= 0) {
- ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
+ ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
  }
  }

  LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+
  for (int i = 0; i < n_kv; i++) {
- const char * name = gguf_get_key(ctx_gguf, i);
- const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+ const char * name = gguf_get_key(meta, i);
+ const enum gguf_type type = gguf_get_kv_type(meta, i);
  const std::string type_name =
  type == GGUF_TYPE_ARRAY
- ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
+ ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
  : gguf_type_name(type);

- std::string value = gguf_kv_to_str(ctx_gguf, i);
+ std::string value = gguf_kv_to_str(meta, i);
  const size_t MAX_VALUE_LEN = 40;
  if (value.size() > MAX_VALUE_LEN) {
  value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
@@ -2900,18 +3109,18 @@ struct llama_model_loader {
  }

  ~llama_model_loader() {
- if (ctx_gguf) {
- gguf_free(ctx_gguf);
+ if (meta) {
+ gguf_free(meta);
  }
- if (ctx_meta) {
- ggml_free(ctx_meta);
+ for (auto * ctx : contexts) {
+ ggml_free(ctx);
  }
  }

  template<typename T>
  typename std::enable_if<std::is_integral<T>::value, bool>::type
  get_arr_n(const std::string & key, T & result, const bool required = true) {
- const int kid = gguf_find_key(ctx_gguf, key.c_str());
+ const int kid = gguf_find_key(meta, key.c_str());

  if (kid < 0) {
  if (required) {
@@ -2921,7 +3130,7 @@ struct llama_model_loader {
  }

  struct GGUFMeta::ArrayInfo arr_info =
- GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);


  result = arr_info.length;
@@ -2941,7 +3150,7 @@ struct llama_model_loader {
  const struct llama_model_kv_override * override =
  it != kv_overrides.end() ? &it->second : nullptr;

- const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);
+ const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);

  if (required && !found) {
  throw std::runtime_error(format("key not found in model: %s", key.c_str()));
@@ -2964,28 +3173,57 @@ struct llama_model_loader {
  }

  const char * get_tensor_name(int i) const {
- return gguf_get_tensor_name(ctx_gguf, i);
+ return weights.at(i).tensor->name;
+ }
+
+ const llama_tensor_weight * get_weight(const char * name) const {
+ for (const auto & weight : weights) {
+ if (strcmp(name, weight.tensor->name) == 0) {
+ return &weight;
+ }
+ }
+ return nullptr;
+ }
+
+ const llama_tensor_weight & require_weight(const char * name) const {
+ const llama_tensor_weight * weight = get_weight(name);
+ if (!weight) {
+ throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
+ }
+ return *weight;
  }

  struct ggml_tensor * get_tensor_meta(const char * name) const {
- return ggml_get_tensor(ctx_meta, name);
+ const auto * weight = get_weight(name);
+ if (!weight) {
+ return nullptr;
+ }
+ return weight->tensor;
+ }
+
+ struct ggml_tensor * require_tensor_meta(const char * name) const {
+ struct ggml_tensor * tensor = get_tensor_meta(name);
+ if (!tensor) {
+ throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
+ }
+ return tensor;
  }

  struct ggml_tensor * get_tensor_meta(int i) const {
  return get_tensor_meta(get_tensor_name(i));
  }

- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) {
- struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
- ggml_set_name(tensor, ggml_get_name(meta));
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+ struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
+ ggml_set_name(tensor, ggml_get_name(cur));

  n_created++;

  return tensor;
  }

- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
- struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
+ const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
+ const struct ggml_tensor * cur = get_tensor_meta(name.c_str());

  if (cur == NULL) {
  if (!required) {
@@ -2996,8 +3234,8 @@ struct llama_model_loader {

  {
  bool is_ok = true;
- for (size_t i = 0; i < ne.size(); ++i) {
- if (ne[i] != cur->ne[i]) {
+ for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+ if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
  is_ok = false;
  break;
  }
@@ -3011,127 +3249,196 @@ struct llama_model_loader {
  }
  }

- return create_tensor_for(ctx, cur);
+ return cur;
  }

- void done_getting_tensors() const {
- if (n_created != n_tensors) {
- throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+
+ if (cur == NULL) {
+ return NULL;
  }
+
+ return create_tensor_for(ctx, cur);
  }

- size_t file_offset(const char * name) const {
- const int idx = gguf_find_tensor(ctx_gguf, name);
+ struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);

- if (idx < 0) {
- throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name));
+ if (cur == NULL) {
+ return NULL;
  }

- return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
- }
+ if (cur->type != base->type) {
+ throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type)));
+ }

- void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) {
- // prefetch the whole file - all the data is needed anyway
- if (use_mmap) {
- mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
+ std::array<int64_t, GGML_MAX_DIMS> dims;
+ for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+ dims[i] = i < ne.size() ? ne[i] : 1;
  }

- // compute the total size of all tensors for progress reporting
- for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
- struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
- size_data += ggml_nbytes(cur);
+ struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
+ dims[0], dims[1], dims[2], dims[3],
+ cur->nb[1], cur->nb[2], cur->nb[3],
+ offset);
+
+ ggml_set_name(tensor, name.c_str());
+
+ n_created++;
+
+ return tensor;
+ }
+
+ void done_getting_tensors() const {
+ if (n_created != n_tensors) {
+ throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
  }
+ }

- if (use_mmap && mapping) {
- if (lmlock) {
- lmlock->init(mapping->addr);
+ void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) {
+ if (use_mmap) {
+ mappings.reserve(files.size());
+ mmaps_used.reserve(files.size());
+ for (const auto & file : files) {
+ std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
+ mmaps_used.emplace_back(mapping->size, 0);
+ if (mlock_mmaps) {
+ std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
+ mlock_mmap->init(mapping->addr);
+ mlock_mmaps->emplace_back(std::move(mlock_mmap));
+ }
+ mappings.emplace_back(std::move(mapping));
  }
- mmap_used_first = mapping->size;
+ }
+
+ // compute the total size of all tensors for progress reporting
+ for (auto & w : weights) {
+ size_data += ggml_nbytes(w.tensor);
  }
  }

- void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
- GGML_ASSERT(mapping);
+ void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
+ GGML_ASSERT(!mappings.empty());
+ const auto & mapping = mappings.at(idx);

  *first = mapping->size;
  *last = 0;
+ *addr = mapping->addr;
  for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
- const size_t offs = file_offset(ggml_get_name(tensor));
- *first = std::min(*first, offs);
- *last = std::max(*last, offs + ggml_nbytes(tensor));
+ try {
+ const auto * weight = get_weight(ggml_get_name(tensor));
+ if (!weight) {
+ continue;
+ }
+ if (weight->idx != idx) {
+ continue;
+ }
+ *first = std::min(*first, weight->offs);
+ *last = std::max(*last, weight->offs + ggml_nbytes(tensor));
+ } catch(...) {
+ // the tensor is not in the model
+ }
  }
  }

  // for backwards compatibility, does not support ggml-backend
  void load_data_for(struct ggml_tensor * cur) const {
- const size_t offs = file_offset(ggml_get_name(cur));
+ const auto & w = require_weight(ggml_get_name(cur));

- if (use_mmap && mapping) {
+ if (use_mmap) {
+ const auto & mapping = mappings.at(w.idx);
  if (cur->data == nullptr) {
- cur->data = (uint8_t *)mapping->addr + offs;
+ cur->data = (uint8_t *)mapping->addr + w.offs;
  } else {
- memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
+ memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur));
  }
  } else {
  GGML_ASSERT(cur->data != nullptr);
- file.seek(offs, SEEK_SET);
- file.read_raw(cur->data, ggml_nbytes(cur));
+ GGML_ASSERT(w.idx < files.size());
+ const auto & file = files.at(w.idx);
+ file->seek(w.offs, SEEK_SET);
+ file->read_raw(cur->data, ggml_nbytes(cur));
  }
  }

  size_t size_done = 0;
  size_t size_data = 0;
- size_t mmap_used_first = -1;
- size_t mmap_used_last = 0;
+ std::vector<std::pair<size_t, size_t>> mmaps_used;

  // Returns false if cancelled by progress_callback
- bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) {
- GGML_ASSERT(size_data != 0 && "call init_mapping() first");
+ bool load_all_data(
+ struct ggml_context * ctx,
+ llama_buf_map & bufs_mmap,
+ llama_mlocks * lmlocks,
+ llama_progress_callback progress_callback,
+ void * progress_callback_user_data) {
+ GGML_ASSERT(size_data != 0 && "call init_mappings() first");

  std::vector<no_init<uint8_t>> read_buf;
-
  for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+ const auto * weight = get_weight(ggml_get_name(cur));
+ if (weight == nullptr) {
+ // this can happen with split experts models
+ continue;
+ }
+
  if (progress_callback) {
  if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
  return false;
  }
  }

- const size_t offs = file_offset(ggml_get_name(cur));
+ size_t n_size = ggml_nbytes(cur);

- if (use_mmap && mapping) {
+ if (use_mmap) {
+ const auto & mapping = mappings.at(weight->idx);
+ ggml_backend_buffer_t buf_mmap = nullptr;
+ if (bufs_mmap.count(weight->idx)) {
+ buf_mmap = bufs_mmap.at(weight->idx);
+ }
+ GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
  if (buf_mmap && cur->data == nullptr) {
- ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
- if (lmlock) {
- lmlock->grow_to(offs + ggml_nbytes(cur));
+ ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
+ if (lmlocks) {
+ const auto & lmlock = lmlocks->at(weight->idx);
+ lmlock->grow_to(weight->offs + ggml_nbytes(cur));
  }
- mmap_used_first = std::min(mmap_used_first, offs);
- mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur));
+
+ auto & mmap_used = mmaps_used[weight->idx];
+ mmap_used.first = std::min(mmap_used.first, weight->offs);
+ mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
  } else {
- ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
+ ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
  }
  } else {
+ GGML_ASSERT(weight->idx < files.size());
+ const auto & file = files.at(weight->idx);
  if (ggml_backend_buffer_is_host(cur->buffer)) {
- file.seek(offs, SEEK_SET);
- file.read_raw(cur->data, ggml_nbytes(cur));
+ file->seek(weight->offs, SEEK_SET);
+ file->read_raw(cur->data, ggml_nbytes(cur));
  } else {
  read_buf.resize(ggml_nbytes(cur));
- file.seek(offs, SEEK_SET);
- file.read_raw(read_buf.data(), ggml_nbytes(cur));
- ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
+ file->seek(weight->offs, SEEK_SET);
+ file->read_raw(read_buf.data(), ggml_nbytes(cur));
+ ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
  }
  }

- size_done += ggml_nbytes(cur);
+ size_done += n_size;
  }

  // check if this is the last call and do final cleanup
  if (size_done >= size_data) {
  // unmap offloaded tensors and metadata
- if (use_mmap && mapping) {
- mapping->unmap_fragment(0, mmap_used_first);
- if (mmap_used_last != 0) {
- mapping->unmap_fragment(mmap_used_last, mapping->size);
+ if (use_mmap) {
+ for (uint32_t idx = 0; idx < mappings.size(); idx++) {
+ const auto & mmap_used = mmaps_used.at(idx);
+ auto & mapping = mappings.at(idx);
+ mapping->unmap_fragment(0, mmap_used.first);
+ if (mmap_used.second != 0) {
+ mapping->unmap_fragment(mmap_used.second, mapping->size);
+ }
  }
  }
  if (progress_callback) {
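
[Note] create_tensor_as_view is what lets the loader place each legacy per-expert 2D tensor inside one merged 3D tensor, using the expert index times nb[2] as the byte offset into the merged storage. A standalone sketch of the same slicing with plain ggml calls is shown below; the helper name expert_view is made up for illustration.

    // sketch: expert x of a merged [n_embd, n_ff, n_expert] tensor as a 2D view
    ggml_tensor * expert_view(ggml_context * ctx, ggml_tensor * merged, int64_t x) {
        return ggml_view_2d(ctx, merged,
                            merged->ne[0], merged->ne[1], // n_embd, n_ff
                            merged->nb[1],                // row stride is unchanged
                            merged->nb[2]*x);             // jump to the x-th expert slab
    }
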
@@ -3204,6 +3511,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ1_M :return "IQ1_M - 1.75 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
@@ -3231,9 +3539,11 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_20B: return "20B";
  case MODEL_30B: return "30B";
  case MODEL_34B: return "34B";
+ case MODEL_35B: return "35B";
  case MODEL_40B: return "40B";
  case MODEL_65B: return "65B";
  case MODEL_70B: return "70B";
+ case MODEL_314B: return "314B";
  case MODEL_SMALL: return "0.1B";
  case MODEL_MEDIUM: return "0.4B";
  case MODEL_LARGE: return "0.8B";
@@ -3263,7 +3573,7 @@ static void llm_load_hparams(
  llama_model_loader & ml,
  llama_model & model) {
  auto & hparams = model.hparams;
- const gguf_context * ctx = ml.ctx_gguf;
+ const gguf_context * ctx = ml.meta;

  // get metadata as string
  for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -3372,6 +3682,15 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_GROK:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 64: model.type = e_model::MODEL_314B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_FALCON:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3623,6 +3942,25 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_XVERSE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 40: model.type = e_model::MODEL_13B; break;
+ case 80: model.type = e_model::MODEL_65B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_COMMAND_R:
+ {
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ switch (hparams.n_layer) {
+ case 40: model.type = e_model::MODEL_35B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }

@@ -3644,7 +3982,7 @@ static void llm_load_vocab(
  llama_model & model) {
  auto & vocab = model.vocab;

- struct gguf_context * ctx = ml.ctx_gguf;
+ struct gguf_context * ctx = ml.meta;

  const auto kv = LLM_KV(model.arch);

@@ -3777,7 +4115,7 @@ static void llm_load_vocab(
  } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
  vocab.linefeed_id = vocab.special_pad_id;
  } else {
- const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
+ const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
  GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
  vocab.linefeed_id = ids[0];
  }
@@ -3944,6 +4282,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
  LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
  LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
+ LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
  LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
  LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
@@ -4009,6 +4348,7 @@ static bool llm_load_tensors(

  const int64_t n_layer = hparams.n_layer;
  const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
+ bool use_mmap_buffer = true;

  // there is very little benefit to offloading the input layer, so always keep it on the CPU
  model.buft_input = llama_default_buffer_type_cpu(true);
@@ -4097,6 +4437,10 @@ static bool llm_load_tensors(

  // create one context per buffer type
  size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
+
+ // for moe merged tensors
+ ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
+
  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
  for (auto & it : buft_layer_count) {
  struct ggml_init_params params = {
@@ -4123,6 +4467,11 @@ static bool llm_load_tensors(
  const int64_t n_vocab = hparams.n_vocab;
  const int64_t n_vocab_type = hparams.n_vocab_type;
  const int64_t n_ff = hparams.n_ff;
+ const int64_t n_expert = hparams.n_expert;
+
+ if (n_expert > 0 && hparams.n_expert_used == 0) {
+ throw std::runtime_error("model has expert layers but no expert layers are used");
+ }

  GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

@@ -4177,26 +4526,113 @@ static bool llm_load_tensors(

  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

- layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
-
- if (layer.ffn_gate_inp == nullptr) {
- GGML_ASSERT(hparams.n_expert == 0);
- GGML_ASSERT(hparams.n_expert_used == 0);
-
+ if (n_expert == 0) {
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  } else {
- GGML_ASSERT(hparams.n_expert > 0);
- GGML_ASSERT(hparams.n_expert_used > 0);
-
- // MoE branch
- for (uint32_t x = 0; x < hparams.n_expert; ++x) {
- layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff});
- layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd});
- layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff});
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+ if (layer.ffn_gate_exps) {
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+ } else {
+ // merge split expert into a single tensor for compatibility with older models
+ // requires disabling mmap
+ use_mmap_buffer = false;
+
+ ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
+ ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
+ ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
+
+ layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
+ layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
+ layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
+
+ ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
+ ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
+ ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
+
+ for (uint32_t x = 0; x < n_expert; ++x) {
+ // the individual experts are loaded into a view of the merged tensor
+ ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
+ ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
+ ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
+ }
+ }
+ }
+ }
+ } break;
+ case LLM_ARCH_GROK:
+ {
+ if (n_expert == 0) {
+ throw std::runtime_error("Grok model cannot have zero experts");
+ }
+
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ ml.n_created--; // artificial tensor
+ ml.size_data += ggml_nbytes(model.output);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+ if (layer.ffn_gate_exps) {
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+ } else {
+ // merge split expert into a single tensor for compatibility with older models
+ // requires disabling mmap
+ use_mmap_buffer = false;
+
+ ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
+ ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
+ ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
+
+ layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
+ layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
+ layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
+
+ ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
+ ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
+ ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
+
+ for (uint32_t x = 0; x < n_expert; ++x) {
+ // the individual experts are loaded into a view of the merged tensor
+ ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
+ ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
+ ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
  }
  }
+
+ layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
  }
  } break;
  case LLM_ARCH_BAICHUAN:
@@ -4235,9 +4671,9 @@ static bool llm_load_tensors(
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
- if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
- } else {
+
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ if (!model.output) {
  model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
  ml.n_created--; // artificial tensor
  ml.size_data += ggml_nbytes(model.output);
@@ -4253,10 +4689,8 @@ static bool llm_load_tensors(
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

- if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
- layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
- layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
- }
+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);

  layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
@@ -4436,16 +4870,19 @@ static bool llm_load_tensors(
  case LLM_ARCH_MPT:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);

  // output
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);

- // same as tok_embd, duplicated to allow offloading
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- ml.n_created--; // artificial tensor
- ml.size_data += ggml_nbytes(model.output);
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ if (!model.output) {
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
+ ml.n_created--; // artificial tensor
+ ml.size_data += ggml_nbytes(model.output);
+ }
  }

  for (int i = 0; i < n_layer; ++i) {
@@ -4472,6 +4909,12 @@ static bool llm_load_tensors(
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);

+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
4914
+
4915
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
4916
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
4917
+
4475
4918
  // AWQ ScaleActivation layer
4476
4919
  layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
4477
4920
  }
@@ -4918,6 +5361,59 @@ static bool llm_load_tensors(
4918
5361
  layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
4919
5362
  }
4920
5363
  } break;
5364
+ case LLM_ARCH_XVERSE:
5365
+ {
5366
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5367
+ {
5368
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5369
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
5370
+ }
5371
+ for (int i = 0; i < n_layer; ++i) {
5372
+ ggml_context * ctx_layer = ctx_for_layer(i);
5373
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5374
+ auto & layer = model.layers[i];
5375
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5376
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5377
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5378
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5379
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5380
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
5381
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5382
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
5383
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5384
+ }
5385
+ } break;
5386
+ case LLM_ARCH_COMMAND_R:
5387
+ {
5388
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5389
+
5390
+ // output
5391
+ {
5392
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5393
+ // init output from the input tok embed
5394
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5395
+ ml.n_created--; // artificial tensor
5396
+ ml.size_data += ggml_nbytes(model.output);
5397
+ }
5398
+
5399
+ for (int i = 0; i < n_layer; ++i) {
5400
+ ggml_context * ctx_layer = ctx_for_layer(i);
5401
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5402
+
5403
+ auto & layer = model.layers[i];
5404
+
5405
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5406
+
5407
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5408
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5409
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5410
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5411
+
5412
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5413
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
5414
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5415
+ }
5416
+ } break;
4921
5417
  default:
4922
5418
  throw std::runtime_error("unknown architecture");
4923
5419
  }
@@ -4925,49 +5421,97 @@ static bool llm_load_tensors(
4925
5421
 
4926
5422
  ml.done_getting_tensors();
4927
5423
 
4928
- ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr);
5424
+ ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
5425
+ model.mappings.reserve(ml.mappings.size());
4929
5426
 
4930
5427
  // create the backend buffers
4931
- std::vector<std::pair<ggml_context *, ggml_backend_buffer_t>> ctx_bufs;
5428
+ std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
5429
+ ctx_bufs.reserve(ctx_map.size());
5430
+
5431
+ // Ensure we have enough capacity for the maximum number of backend buffers we will potentially create
5432
+ size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
5433
+ model.bufs.reserve(n_max_backend_buffer);
4932
5434
 
4933
5435
  for (auto & it : ctx_map) {
4934
5436
  ggml_backend_buffer_type_t buft = it.first;
4935
- ggml_context * ctx = it.second;
4936
- ggml_backend_buffer_t buf = nullptr;
5437
+ ggml_context * ctx = it.second;
5438
+
5439
+ llama_buf_map bufs;
5440
+ bufs.reserve(n_max_backend_buffer);
4937
5441
 
4938
5442
  // only the mmap region containing the tensors in the model is mapped to the backend buffer
4939
5443
  // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
4940
5444
  // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
4941
- if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
4942
- size_t first, last;
4943
- ml.get_mapping_range(&first, &last, ctx);
4944
- buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
5445
+ if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
5446
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
5447
+ void * addr = nullptr;
5448
+ size_t first, last;
5449
+ ml.get_mapping_range(&first, &last, &addr, idx, ctx);
5450
+ if (first >= last) {
5451
+ continue;
5452
+ }
5453
+ ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
5454
+ if (buf == nullptr) {
5455
+ throw std::runtime_error("unable to allocate backend CPU buffer");
5456
+ }
5457
+ model.bufs.push_back(buf);
5458
+ bufs.emplace(idx, buf);
5459
+ #ifdef GGML_USE_CUDA
5460
+ if (n_layer >= n_gpu_layers) {
5461
+ ggml_backend_cuda_register_host_buffer(
5462
+ ggml_backend_buffer_get_base(buf),
5463
+ ggml_backend_buffer_get_size(buf));
5464
+ }
5465
+ #endif
5466
+ }
4945
5467
  }
4946
5468
  #ifdef GGML_USE_METAL
4947
- else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
4948
- const size_t max_size = ggml_get_max_tensor_size(ctx);
4949
- size_t first, last;
4950
- ml.get_mapping_range(&first, &last, ctx);
4951
- buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size);
5469
+ else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
5470
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
5471
+ const size_t max_size = ggml_get_max_tensor_size(ctx);
5472
+ void * addr = nullptr;
5473
+ size_t first, last;
5474
+ ml.get_mapping_range(&first, &last, &addr, idx, ctx);
5475
+ if (first >= last) {
5476
+ continue;
5477
+ }
5478
+ ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
5479
+ if (buf == nullptr) {
5480
+ throw std::runtime_error("unable to allocate backend metal buffer");
5481
+ }
5482
+ model.bufs.push_back(buf);
5483
+ bufs.emplace(idx, buf);
5484
+ }
4952
5485
  }
4953
5486
  #endif
4954
5487
  else {
4955
- buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
4956
- if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
5488
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
5489
+ if (buf == nullptr) {
5490
+ throw std::runtime_error("unable to allocate backend buffer");
5491
+ }
5492
+ model.bufs.push_back(buf);
5493
+ if (use_mlock && ggml_backend_buffer_is_host(buf)) {
4957
5494
  model.mlock_bufs.emplace_back(new llama_mlock);
4958
5495
  auto & mlock_buf = model.mlock_bufs.back();
4959
5496
  mlock_buf->init (ggml_backend_buffer_get_base(buf));
4960
5497
  mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
4961
5498
  }
5499
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
5500
+ bufs.emplace(idx, buf);
5501
+ }
4962
5502
  }
4963
- if (buf == nullptr) {
5503
+
5504
+ if (bufs.empty()) {
4964
5505
  throw std::runtime_error("failed to allocate buffer");
4965
5506
  }
4966
- // indicate that this buffer contains weights
4967
- // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
4968
- ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
4969
- model.bufs.push_back(buf);
4970
- ctx_bufs.emplace_back(ctx, buf);
5507
+
5508
+ for (auto & buf : bufs) {
5509
+ // indicate that this buffer contains weights
5510
+ // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
5511
+ ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
5512
+ }
5513
+
5514
+ ctx_bufs.emplace_back(ctx, bufs);
4971
5515
  }
4972
5516
 
4973
5517
  if (llama_supports_gpu_offload()) {
@@ -4999,13 +5543,17 @@ static bool llm_load_tensors(
4999
5543
  // load tensor data
5000
5544
  for (auto & it : ctx_bufs) {
5001
5545
  ggml_context * ctx = it.first;
5002
- ggml_backend_buffer_t buf = it.second;
5003
- if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) {
5546
+ auto & bufs = it.second;
5547
+ if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
5004
5548
  return false;
5005
5549
  }
5006
5550
  }
5007
5551
 
5008
- model.mapping = std::move(ml.mapping);
5552
+ if (use_mmap_buffer) {
5553
+ for (auto & mapping : ml.mappings) {
5554
+ model.mappings.emplace_back(std::move(mapping));
5555
+ }
5556
+ }
5009
5557
 
5010
5558
  // loading time will be recalculated after the first eval, so
5011
5559
  // we take page faults deferred by mmap() into consideration
@@ -5064,6 +5612,16 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
5064
5612
  }
5065
5613
  #endif
5066
5614
 
5615
+ #ifdef GGML_USE_SYCL
5616
+ if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
5617
+ ggml_backend_sycl_set_single_device_mode(params.main_gpu);
5618
+ // SYCL uses device indices (0, 1, 2) directly; the user passes a device id, which is converted to a device index here.
5619
+ params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu);
5620
+ } else {
5621
+ ggml_backend_sycl_set_mul_device_mode();
5622
+ }
5623
+ #endif
5624
+
5067
5625
  if (!llm_load_tensors(
5068
5626
  ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
5069
5627
  params.progress_callback, params.progress_callback_user_data
@@ -5150,8 +5708,8 @@ static void llm_build_kv_store(
5150
5708
  GGML_ASSERT(kv.size == n_ctx);
5151
5709
 
5152
5710
  // compute the transposed [n_tokens, n_embd] V matrix
5153
- struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
5154
- //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
5711
+ assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
5712
+ struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
5155
5713
  cb(v_cur_t, "v_cur_t", il);
5156
5714
 
5157
5715
  struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
@@ -5335,6 +5893,20 @@ static struct ggml_tensor * llm_build_kqv(
5335
5893
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
5336
5894
  }
5337
5895
 
5896
+ if (model.arch == LLM_ARCH_GROK) {
5897
+ // need to do the following:
5898
+ // multiply by attn_output_multiplier of 0.08838834764831845
5899
+ // and then:
5900
+ // kq = 30 * tanh(kq / 30)
5901
+ // before the softmax below
5902
+
5903
+ // try from phi2
5904
+ //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
5905
+
5906
+ kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
5907
+ kq = ggml_scale(ctx, kq, 30);
5908
+ }
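For reference, a scalar sketch of the soft-cap applied above: the two ggml_scale/ggml_tanh calls fold the attention-output multiplier into the pre-tanh scale, so per element the result is 30 * tanh((0.08838834764831845 * kq) / 30). The constants are the ones in the diff; the helper name is hypothetical and the snippet is illustrative only.

    #include <cmath>

    // scalar equivalent of the Grok attention soft-cap built in the graph above
    static float grok_attn_soft_cap(float kq) {
        const float attn_output_multiplier = 0.08838834764831845f;
        const float cap = 30.0f;
        // same as: cap * tanh((attn_output_multiplier * kq) / cap)
        return cap * std::tanh(kq * (attn_output_multiplier / cap));
    }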
5909
+
5338
5910
  #if defined(GGML_USE_KOMPUTE)
5339
5911
  #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
5340
5912
  #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
@@ -5461,7 +6033,8 @@ struct llm_build_context {
5461
6033
  const float norm_rms_eps;
5462
6034
 
5463
6035
  const int32_t n_tokens;
5464
- const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx)
6036
+ const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
6037
+ const int32_t n_outputs;
5465
6038
  const int32_t kv_head; // index of where we store new KV data in the cache
5466
6039
  const int32_t n_orig_ctx;
5467
6040
 
@@ -5508,6 +6081,7 @@ struct llm_build_context {
5508
6081
  norm_rms_eps (hparams.f_norm_rms_eps),
5509
6082
  n_tokens (batch.n_tokens),
5510
6083
  n_kv (worst_case ? kv_self.size : kv_self.n),
6084
+ n_outputs (worst_case ? n_tokens : lctx.n_outputs),
5511
6085
  kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
5512
6086
  n_orig_ctx (cparams.n_yarn_orig_ctx),
5513
6087
  pooling_type (cparams.pooling_type),
@@ -5529,6 +6103,7 @@ struct llm_build_context {
5529
6103
  lctx.inp_tokens = nullptr;
5530
6104
  lctx.inp_embd = nullptr;
5531
6105
  lctx.inp_pos = nullptr;
6106
+ lctx.inp_out_ids = nullptr;
5532
6107
  lctx.inp_KQ_mask = nullptr;
5533
6108
  lctx.inp_KQ_pos = nullptr;
5534
6109
  lctx.inp_K_shift = nullptr;
@@ -5652,6 +6227,13 @@ struct llm_build_context {
5652
6227
  return lctx.inp_pos;
5653
6228
  }
5654
6229
 
6230
+ struct ggml_tensor * build_inp_out_ids() {
6231
+ lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
6232
+ cb(lctx.inp_out_ids, "inp_out_ids", -1);
6233
+ ggml_set_input(lctx.inp_out_ids);
6234
+ return lctx.inp_out_ids;
6235
+ }
6236
+
5655
6237
  struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
5656
6238
  if (causal) {
5657
6239
  lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
@@ -5708,6 +6290,9 @@ struct llm_build_context {
5708
6290
  struct ggml_cgraph * build_llama() {
5709
6291
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5710
6292
 
6293
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
6294
+ int32_t n_tokens = this->n_tokens;
6295
+
5711
6296
  const int64_t n_embd_head = hparams.n_embd_head_v;
5712
6297
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5713
6298
  GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -5775,6 +6360,14 @@ struct llm_build_context {
5775
6360
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5776
6361
  }
5777
6362
 
6363
+ if (il == n_layer - 1) {
6364
+ // skip computing output for unused tokens
6365
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6366
+ n_tokens = n_outputs;
6367
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6368
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6369
+ }
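A minimal sketch of the row-skipping idiom introduced here and repeated in the other graph builders below (assuming a ggml context `ctx`, activations `cur` of shape [n_embd, n_tokens], and a precomputed `n_outputs`; the names are illustrative, not part of the diff). Only the rows listed in the index tensor survive, so a 512-token prompt that requests a single logit runs the lm_head matmul over 1 row instead of 512.

    // gather only the requested token positions before the output head
    struct ggml_tensor * out_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_outputs);
    ggml_set_input(out_ids);                 // filled from batch.logits at eval time
    cur = ggml_get_rows(ctx, cur, out_ids);  // [n_embd, n_tokens] -> [n_embd, n_outputs]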
6370
+
5778
6371
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
5779
6372
  cb(ffn_inp, "ffn_inp", il);
5780
6373
 
@@ -5827,19 +6420,19 @@ struct llm_build_context {
5827
6420
  for (int i = 0; i < n_expert_used; ++i) {
5828
6421
  ggml_tensor * cur_expert;
5829
6422
 
5830
- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
6423
+ ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
5831
6424
  cb(cur_up, "ffn_moe_up", il);
5832
6425
 
5833
- ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
6426
+ ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
5834
6427
  cb(cur_gate, "ffn_moe_gate", il);
5835
6428
 
5836
6429
  cur_gate = ggml_silu(ctx0, cur_gate);
5837
6430
  cb(cur_gate, "ffn_moe_silu", il);
5838
6431
 
5839
- cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
6432
+ cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
5840
6433
  cb(cur_expert, "ffn_moe_gate_par", il);
5841
6434
 
5842
- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
6435
+ cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
5843
6436
  cb(cur_expert, "ffn_moe_down", il);
5844
6437
 
5845
6438
  cur_expert = ggml_mul(ctx0, cur_expert,
@@ -5858,6 +6451,12 @@ struct llm_build_context {
5858
6451
  }
5859
6452
 
5860
6453
  cur = ggml_add(ctx0, cur, ffn_inp);
6454
+ cb(cur, "ffn_out", il);
6455
+
6456
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
6457
+ if (layer_dir != nullptr) {
6458
+ cur = ggml_add(ctx0, cur, layer_dir);
6459
+ }
5861
6460
  cb(cur, "l_out", il);
5862
6461
 
5863
6462
  // input for next layer
@@ -5893,7 +6492,7 @@ struct llm_build_context {
5893
6492
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
5894
6493
 
5895
6494
  // inp_pos - contains the positions
5896
- struct ggml_tensor * inp_pos = build_inp_pos();
6495
+ struct ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr;
5897
6496
 
5898
6497
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5899
6498
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
@@ -5943,12 +6542,18 @@ struct llm_build_context {
5943
6542
  cb(Qcur, "Qcur", il);
5944
6543
  cb(Kcur, "Kcur", il);
5945
6544
 
5946
-
5947
6545
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5948
6546
  model.layers[il].wo, NULL,
5949
6547
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5950
6548
  }
5951
6549
 
6550
+ if (il == n_layer - 1) {
6551
+ // skip computing output for unused tokens
6552
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6553
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6554
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6555
+ }
6556
+
5952
6557
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
5953
6558
  cb(ffn_inp, "ffn_inp", il);
5954
6559
 
@@ -5991,6 +6596,111 @@ struct llm_build_context {
5991
6596
  return gf;
5992
6597
  }
5993
6598
 
6599
+ struct ggml_cgraph * build_xverse() {
6600
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6601
+
6602
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6603
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6604
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6605
+
6606
+ struct ggml_tensor * cur;
6607
+ struct ggml_tensor * inpL;
6608
+
6609
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6610
+
6611
+ // inp_pos - contains the positions
6612
+ struct ggml_tensor * inp_pos = build_inp_pos();
6613
+
6614
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6615
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6616
+
6617
+ // positions of the tokens in the KV cache
6618
+ struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
6619
+
6620
+ for (int il = 0; il < n_layer; ++il) {
6621
+ struct ggml_tensor * inpSA = inpL;
6622
+
6623
+ cur = llm_build_norm(ctx0, inpL, hparams,
6624
+ model.layers[il].attn_norm, NULL,
6625
+ LLM_NORM_RMS, cb, il);
6626
+ cb(cur, "attn_norm", il);
6627
+
6628
+ // self-attention
6629
+ {
6630
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6631
+ cb(Qcur, "Qcur", il);
6632
+
6633
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
6634
+ cb(Kcur, "Kcur", il);
6635
+
6636
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
6637
+ cb(Vcur, "Vcur", il);
6638
+
6639
+ Qcur = ggml_rope_custom(
6640
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6641
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6642
+ ext_factor, attn_factor, beta_fast, beta_slow
6643
+ );
6644
+ cb(Qcur, "Qcur", il);
6645
+
6646
+ Kcur = ggml_rope_custom(
6647
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6648
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6649
+ ext_factor, attn_factor, beta_fast, beta_slow
6650
+ );
6651
+ cb(Kcur, "Kcur", il);
6652
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6653
+ model.layers[il].wo, NULL,
6654
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6655
+ }
6656
+
6657
+ if (il == n_layer - 1) {
6658
+ // skip computing output for unused tokens
6659
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6660
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6661
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6662
+ }
6663
+
6664
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6665
+ cb(ffn_inp, "ffn_inp", il);
6666
+
6667
+ // feed-forward network
6668
+ {
6669
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6670
+ model.layers[il].ffn_norm, NULL,
6671
+ LLM_NORM_RMS, cb, il);
6672
+ cb(cur, "ffn_norm", il);
6673
+
6674
+ cur = llm_build_ffn(ctx0, cur,
6675
+ model.layers[il].ffn_up, NULL,
6676
+ model.layers[il].ffn_gate, NULL,
6677
+ model.layers[il].ffn_down, NULL,
6678
+ NULL,
6679
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
6680
+ cb(cur, "ffn_out", il);
6681
+ }
6682
+
6683
+ cur = ggml_add(ctx0, cur, ffn_inp);
6684
+ cb(cur, "l_out", il);
6685
+
6686
+ // input for next layer
6687
+ inpL = cur;
6688
+ }
6689
+
6690
+ cur = inpL;
6691
+
6692
+ cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
6693
+ cb(cur, "result_norm", -1);
6694
+
6695
+ // lm_head
6696
+ cur = ggml_mul_mat(ctx0, model.output, cur);
6697
+ cb(cur, "result_output", -1);
6698
+
6699
+ ggml_build_forward_expand(gf, cur);
6700
+
6701
+ return gf;
6702
+ }
6703
+
5994
6704
  struct ggml_cgraph * build_falcon() {
5995
6705
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5996
6706
 
@@ -6064,6 +6774,14 @@ struct llm_build_context {
6064
6774
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6065
6775
  }
6066
6776
 
6777
+ if (il == n_layer - 1) {
6778
+ // skip computing output for unused tokens
6779
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6780
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6781
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6782
+ attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
6783
+ }
6784
+
6067
6785
  struct ggml_tensor * ffn_inp = cur;
6068
6786
 
6069
6787
  // feed forward
@@ -6104,6 +6822,214 @@ struct llm_build_context {
6104
6822
  return gf;
6105
6823
  }
6106
6824
 
6825
+ struct ggml_cgraph * build_grok() {
6826
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6827
+
6828
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
6829
+ int32_t n_tokens = this->n_tokens;
6830
+
6831
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6832
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6833
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6834
+
6835
+ struct ggml_tensor * cur;
6836
+ struct ggml_tensor * inpL;
6837
+
6838
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6839
+
6840
+ // multiply by embedding_multiplier_scale of 78.38367176906169
6841
+ inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
6842
+
6843
+ // inp_pos - contains the positions
6844
+ struct ggml_tensor * inp_pos = build_inp_pos();
6845
+
6846
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6847
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6848
+
6849
+ for (int il = 0; il < n_layer; ++il) {
6850
+ struct ggml_tensor * inpSA = inpL;
6851
+
6852
+ // norm
6853
+ cur = llm_build_norm(ctx0, inpL, hparams,
6854
+ model.layers[il].attn_norm, NULL,
6855
+ LLM_NORM_RMS, cb, il);
6856
+ cb(cur, "attn_norm", il);
6857
+
6858
+
6859
+ // self-attention
6860
+ {
6861
+ // compute Q and K and RoPE them
6862
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6863
+ cb(Qcur, "Qcur", il);
6864
+ if (model.layers[il].bq) {
6865
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
6866
+ cb(Qcur, "Qcur", il);
6867
+ }
6868
+
6869
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
6870
+ cb(Kcur, "Kcur", il);
6871
+ if (model.layers[il].bk) {
6872
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
6873
+ cb(Kcur, "Kcur", il);
6874
+ }
6875
+
6876
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
6877
+ cb(Vcur, "Vcur", il);
6878
+ if (model.layers[il].bv) {
6879
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
6880
+ cb(Vcur, "Vcur", il);
6881
+ }
6882
+
6883
+ Qcur = ggml_rope_custom(
6884
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6885
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6886
+ ext_factor, attn_factor, beta_fast, beta_slow
6887
+ );
6888
+ cb(Qcur, "Qcur", il);
6889
+
6890
+ Kcur = ggml_rope_custom(
6891
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6892
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6893
+ ext_factor, attn_factor, beta_fast, beta_slow
6894
+ );
6895
+ cb(Kcur, "Kcur", il);
6896
+
6897
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6898
+ model.layers[il].wo, model.layers[il].bo,
6899
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
6900
+ }
6901
+
6902
+ if (il == n_layer - 1) {
6903
+ // skip computing output for unused tokens
6904
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6905
+ n_tokens = n_outputs;
6906
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6907
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6908
+ }
6909
+
6910
+ // Grok
6911
+ // if attn_out_norm is present then apply it before adding the input
6912
+ if (model.layers[il].attn_out_norm) {
6913
+ cur = llm_build_norm(ctx0, cur, hparams,
6914
+ model.layers[il].attn_out_norm, NULL,
6915
+ LLM_NORM_RMS, cb, il);
6916
+ cb(cur, "attn_out_norm", il);
6917
+ }
6918
+
6919
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6920
+ cb(ffn_inp, "ffn_inp", il);
6921
+
6922
+ // feed-forward network
6923
+ // MoE branch
6924
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6925
+ model.layers[il].ffn_norm, NULL,
6926
+ LLM_NORM_RMS, cb, il);
6927
+ cb(cur, "ffn_norm", il);
6928
+
6929
+ ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
6930
+ cb(logits, "ffn_moe_logits", il);
6931
+
6932
+ ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
6933
+ cb(probs, "ffn_moe_probs", il);
6934
+
6935
+ // select experts
6936
+ ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
6937
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
6938
+
6939
+ ggml_tensor * weights = ggml_get_rows(ctx0,
6940
+ ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
6941
+ cb(weights, "ffn_moe_weights", il);
6942
+
6943
+ weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
6944
+
6945
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
6946
+ cb(weights_sum, "ffn_moe_weights_sum", il);
6947
+
6948
+ weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
6949
+ cb(weights, "ffn_moe_weights_norm", il);
6950
+
6951
+ // compute expert outputs
6952
+ ggml_tensor * moe_out = nullptr;
6953
+
6954
+ for (int i = 0; i < n_expert_used; ++i) {
6955
+ ggml_tensor * cur_expert;
6956
+
6957
+ ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
6958
+ cb(cur_up, "ffn_moe_up", il);
6959
+
6960
+ ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
6961
+ cb(cur_gate, "ffn_moe_gate", il);
6962
+
6963
+ // GeLU
6964
+ cur_gate = ggml_gelu(ctx0, cur_gate);
6965
+ cb(cur_gate, "ffn_moe_gelu", il);
6966
+
6967
+ cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
6968
+ cb(cur_expert, "ffn_moe_gate_par", il);
6969
+
6970
+ cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
6971
+ cb(cur_expert, "ffn_moe_down", il);
6972
+
6973
+ cur_expert = ggml_mul(ctx0, cur_expert,
6974
+ ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
6975
+ cb(cur_expert, "ffn_moe_weighted", il);
6976
+
6977
+ if (i == 0) {
6978
+ moe_out = cur_expert;
6979
+ } else {
6980
+ moe_out = ggml_add(ctx0, moe_out, cur_expert);
6981
+ cb(moe_out, "ffn_moe_out", il);
6982
+ }
6983
+ }
6984
+
6985
+ cur = moe_out;
6986
+
6987
+ // Grok
6988
+ // if layer_out_norm is present then apply it before adding the input
6989
+ // Idea: maybe ffn_out_norm is a better name
6990
+ if (model.layers[il].layer_out_norm) {
6991
+ cur = llm_build_norm(ctx0, cur, hparams,
6992
+ model.layers[il].layer_out_norm, NULL,
6993
+ LLM_NORM_RMS, cb, il);
6994
+ cb(cur, "layer_out_norm", il);
6995
+ }
6996
+
6997
+
6998
+ cur = ggml_add(ctx0, cur, ffn_inp);
6999
+ cb(cur, "ffn_out", il);
7000
+
7001
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
7002
+ if (layer_dir != nullptr) {
7003
+ cur = ggml_add(ctx0, cur, layer_dir);
7004
+ }
7005
+ cb(cur, "l_out", il);
7006
+
7007
+ // input for next layer
7008
+ inpL = cur;
7009
+ }
7010
+
7011
+ cur = inpL;
7012
+
7013
+ cur = llm_build_norm(ctx0, cur, hparams,
7014
+ model.output_norm, NULL,
7015
+ LLM_NORM_RMS, cb, -1);
7016
+ cb(cur, "result_norm", -1);
7017
+
7018
+ // lm_head
7019
+ cur = ggml_mul_mat(ctx0, model.output, cur);
7020
+
7021
+ // Grok
7022
+ // multiply logits by output_multiplier_scale of 0.5773502691896257
7023
+
7024
+ cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
7025
+
7026
+ cb(cur, "result_output", -1);
7027
+
7028
+ ggml_build_forward_expand(gf, cur);
7029
+
7030
+ return gf;
7031
+ }
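For clarity, a scalar sketch of the expert routing the MoE branch above performs per token: softmax over the router logits, top-k selection, and renormalization of the selected weights so they sum to 1. This is a self-contained illustration (helper name and signature are made up), not part of the diff; ggml does the same thing on [n_tokens, n_expert] tensors.

    #include <algorithm>
    #include <cmath>
    #include <numeric>
    #include <vector>

    // returns the indices of the n_used experts and their renormalized weights
    static void moe_route(const std::vector<float> & logits, int n_used,
                          std::vector<int> & idx, std::vector<float> & w) {
        // softmax over the router logits (ffn_moe_probs)
        std::vector<float> probs(logits.size());
        const float mx = *std::max_element(logits.begin(), logits.end());
        float sum = 0.0f;
        for (size_t i = 0; i < logits.size(); ++i) { probs[i] = std::exp(logits[i] - mx); sum += probs[i]; }
        for (float & p : probs) { p /= sum; }

        // top-k expert selection (ffn_moe_argsort / ggml_top_k)
        idx.resize(logits.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_used, idx.end(),
                          [&](int a, int b) { return probs[a] > probs[b]; });
        idx.resize(n_used);

        // renormalize the selected weights (ffn_moe_weights_norm)
        w.clear();
        float wsum = 0.0f;
        for (int i : idx) { w.push_back(probs[i]); wsum += probs[i]; }
        for (float & x : w) { x /= wsum; }
    }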
7032
+
6107
7033
  struct ggml_cgraph * build_starcoder() {
6108
7034
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6109
7035
 
@@ -6158,6 +7084,13 @@ struct llm_build_context {
6158
7084
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6159
7085
  }
6160
7086
 
7087
+ if (il == n_layer - 1) {
7088
+ // skip computing output for unused tokens
7089
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7090
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7091
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7092
+ }
7093
+
6161
7094
  // add the input
6162
7095
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
6163
7096
  cb(ffn_inp, "ffn_inp", il);
@@ -6355,6 +7288,13 @@ struct llm_build_context {
6355
7288
  Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6356
7289
  }
6357
7290
 
7291
+ if (il == n_layer - 1) {
7292
+ // skip computing output for unused tokens
7293
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7294
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7295
+ residual = ggml_get_rows(ctx0, residual, inp_out_ids);
7296
+ }
7297
+
6358
7298
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
6359
7299
  cb(ffn_inp, "ffn_inp", il);
6360
7300
 
@@ -6444,6 +7384,13 @@ struct llm_build_context {
6444
7384
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6445
7385
  }
6446
7386
 
7387
+ if (il == n_layer - 1) {
7388
+ // skip computing output for unused tokens
7389
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7390
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7391
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7392
+ }
7393
+
6447
7394
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6448
7395
  cb(ffn_inp, "ffn_inp", il);
6449
7396
 
@@ -6601,6 +7548,13 @@ struct llm_build_context {
6601
7548
  }
6602
7549
  cb(cur, "kqv_out", il);
6603
7550
 
7551
+ if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
7552
+ // skip computing output for unused tokens
7553
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7554
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7555
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7556
+ }
7557
+
6604
7558
  // re-add the layer input
6605
7559
  cur = ggml_add(ctx0, cur, inpL);
6606
7560
 
@@ -6723,6 +7677,13 @@ struct llm_build_context {
6723
7677
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6724
7678
  }
6725
7679
 
7680
+ if (il == n_layer - 1) {
7681
+ // skip computing output for unused tokens
7682
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7683
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7684
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7685
+ }
7686
+
6726
7687
  // Add the input
6727
7688
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
6728
7689
  cb(ffn_inp, "ffn_inp", il);
@@ -6770,6 +7731,7 @@ struct llm_build_context {
6770
7731
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6771
7732
 
6772
7733
  struct ggml_tensor * cur;
7734
+ struct ggml_tensor * pos;
6773
7735
  struct ggml_tensor * inpL;
6774
7736
 
6775
7737
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -6780,6 +7742,16 @@ struct llm_build_context {
6780
7742
  // positions of the tokens in the KV cache
6781
7743
  struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
6782
7744
 
7745
+ if (model.pos_embd) {
7746
+ // inp_pos - contains the positions
7747
+ struct ggml_tensor * inp_pos = build_inp_pos();
7748
+ pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
7749
+ cb(pos, "pos_embd", -1);
7750
+
7751
+ inpL = ggml_add(ctx0, inpL, pos);
7752
+ cb(inpL, "inpL", -1);
7753
+ }
7754
+
6783
7755
  for (int il = 0; il < n_layer; ++il) {
6784
7756
  struct ggml_tensor * attn_norm;
6785
7757
 
@@ -6814,11 +7786,39 @@ struct llm_build_context {
6814
7786
  cb(Kcur, "Kcur", il);
6815
7787
  cb(Vcur, "Vcur", il);
6816
7788
 
6817
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7789
+ // Q/K Layernorm
7790
+ if (model.layers[il].attn_q_norm) {
7791
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
7792
+ model.layers[il].attn_q_norm,
7793
+ model.layers[il].attn_q_norm_b,
7794
+ LLM_NORM, cb, il);
7795
+ cb(Qcur, "Qcur", il);
6818
7796
 
6819
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7797
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
7798
+ model.layers[il].attn_k_norm,
7799
+ model.layers[il].attn_k_norm_b,
7800
+ LLM_NORM, cb, il);
7801
+ cb(Kcur, "Kcur", il);
7802
+
7803
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7804
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7805
+
7806
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6820
7807
  model.layers[il].wo, model.layers[il].bo,
6821
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7808
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7809
+ } else {
7810
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7811
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7812
+ model.layers[il].wo, model.layers[il].bo,
7813
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7814
+ }
7815
+ }
7816
+
7817
+ if (il == n_layer - 1) {
7818
+ // skip computing output for unused tokens
7819
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7820
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7821
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6822
7822
  }
6823
7823
 
6824
7824
  // Add the input
@@ -6934,6 +7934,13 @@ struct llm_build_context {
6934
7934
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6935
7935
  }
6936
7936
 
7937
+ if (il == n_layer - 1) {
7938
+ // skip computing output for unused tokens
7939
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7940
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7941
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7942
+ }
7943
+
6937
7944
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6938
7945
  cb(ffn_inp, "ffn_inp", il);
6939
7946
 
@@ -7040,6 +8047,13 @@ struct llm_build_context {
7040
8047
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7041
8048
  }
7042
8049
 
8050
+ if (il == n_layer - 1) {
8051
+ // skip computing output for unused tokens
8052
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8053
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8054
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8055
+ }
8056
+
7043
8057
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7044
8058
  cb(ffn_inp, "ffn_inp", il);
7045
8059
 
@@ -7152,6 +8166,13 @@ struct llm_build_context {
7152
8166
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7153
8167
  }
7154
8168
 
8169
+ if (il == n_layer - 1) {
8170
+ // skip computing output for unused tokens
8171
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8172
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8173
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8174
+ }
8175
+
7155
8176
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7156
8177
  cb(ffn_inp, "ffn_inp", il);
7157
8178
 
@@ -7270,6 +8291,14 @@ struct llm_build_context {
7270
8291
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7271
8292
  }
7272
8293
 
8294
+ if (il == n_layer - 1) {
8295
+ // skip computing output for unused tokens
8296
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8297
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8298
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8299
+ attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
8300
+ }
8301
+
7273
8302
  // FF
7274
8303
  {
7275
8304
  ffn_output = llm_build_ffn(ctx0, attn_norm_output,
@@ -7367,6 +8396,14 @@ struct llm_build_context {
7367
8396
 
7368
8397
  cur = attention_norm;
7369
8398
 
8399
+ if (il == n_layer - 1) {
8400
+ // skip computing output for unused tokens
8401
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8402
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8403
+ sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
8404
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8405
+ }
8406
+
7370
8407
  // feed-forward network
7371
8408
  {
7372
8409
  cur = llm_build_ffn(ctx0, cur,
@@ -7459,6 +8496,13 @@ struct llm_build_context {
7459
8496
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7460
8497
  }
7461
8498
 
8499
+ if (il == n_layer - 1) {
8500
+ // skip computing output for unused tokens
8501
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8502
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8503
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8504
+ }
8505
+
7462
8506
  // add the input
7463
8507
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
7464
8508
  cb(ffn_inp, "ffn_inp", il);
@@ -7559,6 +8603,13 @@ struct llm_build_context {
7559
8603
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7560
8604
  }
7561
8605
 
8606
+ if (il == n_layer - 1) {
8607
+ // skip computing output for unused tokens
8608
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8609
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8610
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8611
+ }
8612
+
7562
8613
  // add the input
7563
8614
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
7564
8615
  cb(ffn_inp, "ffn_inp", il);
@@ -7668,6 +8719,13 @@ struct llm_build_context {
7668
8719
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7669
8720
  }
7670
8721
 
8722
+ if (il == n_layer - 1) {
8723
+ // skip computing output for unused tokens
8724
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8725
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8726
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8727
+ }
8728
+
7671
8729
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7672
8730
  cb(ffn_inp, "ffn_inp", il);
7673
8731
 
@@ -7778,6 +8836,13 @@ struct llm_build_context {
7778
8836
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7779
8837
  }
7780
8838
 
8839
+ if (il == n_layer - 1) {
8840
+ // skip computing output for unused tokens
8841
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8842
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8843
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8844
+ }
8845
+
7781
8846
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7782
8847
  cb(ffn_inp, "ffn_inp", il);
7783
8848
 
@@ -7901,6 +8966,13 @@ struct llm_build_context {
7901
8966
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7902
8967
  }
7903
8968
 
8969
+ if (il == n_layer - 1) {
8970
+ // skip computing output for unused tokens
8971
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8972
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8973
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8974
+ }
8975
+
7904
8976
  // scale_res - scale the hidden states for residual connection
7905
8977
  const float scale_res = scale_depth/sqrtf(float(n_layer));
7906
8978
  cur = ggml_scale(ctx0, cur, scale_res);
@@ -8015,6 +9087,13 @@ struct llm_build_context {
8015
9087
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
8016
9088
  }
8017
9089
 
9090
+ if (il == n_layer - 1) {
9091
+ // skip computing output for unused tokens
9092
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9093
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9094
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9095
+ }
9096
+
8018
9097
  struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
8019
9098
  cb(sa_out, "sa_out", il);
8020
9099
 
@@ -8125,7 +9204,13 @@ struct llm_build_context {
8125
9204
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8126
9205
  model.layers[il].wo, model.layers[il].bo,
8127
9206
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8128
- cb(cur, "kqv_out", il);
9207
+ }
9208
+
9209
+ if (il == n_layer - 1) {
9210
+ // skip computing output for unused tokens
9211
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9212
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9213
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8129
9214
  }
8130
9215
 
8131
9216
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -8275,6 +9360,15 @@ struct llm_build_context {
8275
9360
 
8276
9361
  struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0);
8277
9362
 
9363
+ if (il == n_layer - 1) {
9364
+ // skip computing output for unused tokens
9365
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9366
+ x = ggml_get_rows(ctx0, x, inp_out_ids);
9367
+ y = ggml_get_rows(ctx0, y, inp_out_ids);
9368
+ z = ggml_get_rows(ctx0, z, inp_out_ids);
9369
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9370
+ }
9371
+
8278
9372
  // {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens}
8279
9373
  y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
8280
9374
  y = ggml_mul(ctx0, y, ggml_silu(ctx0, z));
@@ -8305,6 +9399,129 @@ struct llm_build_context {
8305
9399
 
8306
9400
  return gf;
8307
9401
  }
9402
+
9403
+ struct ggml_cgraph * build_command_r() {
9404
+
9405
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
9406
+
9407
+ const int64_t n_embd_head = hparams.n_embd_head_v;
9408
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
9409
+ const float f_logit_scale = hparams.f_logit_scale;
9410
+
9411
+ struct ggml_tensor * cur;
9412
+ struct ggml_tensor * inpL;
9413
+
9414
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
9415
+
9416
+ // inp_pos - contains the positions
9417
+ struct ggml_tensor * inp_pos = build_inp_pos();
9418
+
9419
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
9420
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
9421
+
9422
+ for (int il = 0; il < n_layer; ++il) {
9423
+
9424
+ // norm
9425
+ cur = llm_build_norm(ctx0, inpL, hparams,
9426
+ model.layers[il].attn_norm, NULL,
9427
+ LLM_NORM, cb, il);
9428
+ cb(cur, "attn_norm", il);
9429
+ struct ggml_tensor * ffn_inp = cur;
9430
+
9431
+ // self-attention
9432
+ {
9433
+ // compute Q and K and RoPE them
9434
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
9435
+ cb(Qcur, "Qcur", il);
9436
+ if (model.layers[il].bq) {
9437
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
9438
+ cb(Qcur, "Qcur", il);
9439
+ }
9440
+
9441
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
9442
+ cb(Kcur, "Kcur", il);
9443
+ if (model.layers[il].bk) {
9444
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
9445
+ cb(Kcur, "Kcur", il);
9446
+ }
9447
+
9448
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
9449
+ cb(Vcur, "Vcur", il);
9450
+ if (model.layers[il].bv) {
9451
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
9452
+ cb(Vcur, "Vcur", il);
9453
+ }
9454
+
9455
+ Qcur = ggml_rope_custom(
9456
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9457
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9458
+ ext_factor, attn_factor, beta_fast, beta_slow
9459
+ );
9460
+ cb(Qcur, "Qcur", il);
9461
+
9462
+ Kcur = ggml_rope_custom(
9463
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
9464
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
9465
+ ext_factor, attn_factor, beta_fast, beta_slow
9466
+ );
9467
+ cb(Kcur, "Kcur", il);
9468
+
9469
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
9470
+ model.layers[il].wo, model.layers[il].bo,
9471
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9472
+ }
9473
+
9474
+ if (il == n_layer - 1) {
9475
+ // skip computing output for unused tokens
9476
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9477
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9478
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9479
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
9480
+ }
9481
+
9482
+ struct ggml_tensor * attn_out = cur;
9483
+
9484
+ // feed-forward network
9485
+ {
9486
+ cur = llm_build_ffn(ctx0, ffn_inp,
9487
+ model.layers[il].ffn_up, NULL,
9488
+ model.layers[il].ffn_gate, NULL,
9489
+ model.layers[il].ffn_down, NULL,
9490
+ NULL,
9491
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
9492
+ cb(cur, "ffn_out", il);
9493
+ }
9494
+
9495
+ // add together residual + FFN + self-attention
9496
+ cur = ggml_add(ctx0, cur, inpL);
9497
+ cur = ggml_add(ctx0, cur, attn_out);
9498
+ cb(cur, "l_out", il);
9499
+
9500
+ // input for next layer
9501
+ inpL = cur;
9502
+ }
9503
+
9504
+ cur = inpL;
9505
+
9506
+ cur = llm_build_norm(ctx0, cur, hparams,
9507
+ model.output_norm, NULL,
9508
+ LLM_NORM, cb, -1);
9509
+ cb(cur, "result_norm", -1);
9510
+
9511
+ // lm_head
9512
+ cur = ggml_mul_mat(ctx0, model.output, cur);
9513
+
9514
+ if (f_logit_scale) {
9515
+ cur = ggml_scale(ctx0, cur, f_logit_scale);
9516
+ }
9517
+
9518
+ cb(cur, "result_output", -1);
9519
+
9520
+ ggml_build_forward_expand(gf, cur);
9521
+
9522
+ return gf;
9523
+
9524
+ }
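Note the parallel residual used by this architecture: the attention and feed-forward branches both consume the same pre-attention norm (`ffn_inp` is captured right after `attn_norm`), and their outputs are added back to the raw layer input. Schematically (pseudocode as comments, not part of the diff):

    // h  = LN(x)                  // attn_norm (LLM_NORM, not RMS)
    // x' = x + Attn(h) + FFN(h)   // attention and FFN share the normed input
    // the final logits are scaled by hparams.f_logit_scale when it is non-zero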
8308
9525
  };
8309
9526
 
8310
9527
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -8380,12 +9597,15 @@ static struct ggml_cgraph * llama_build_graph(
8380
9597
  }
8381
9598
 
8382
9599
  // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
8383
- // to fix this, we assign the norm layer manually to the backend of its layer
8384
- if (il != -1 && strcmp(name, "norm") == 0) {
8385
- for (auto * backend : lctx.backends) {
8386
- if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
8387
- ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
8388
- break;
9600
+ // FIXME: fix in ggml_backend_sched
9601
+ const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
9602
+ if (batch.n_tokens < 32 || full_offload) {
9603
+ if (il != -1 && strcmp(name, "norm") == 0) {
9604
+ for (auto * backend : lctx.backends) {
9605
+ if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
9606
+ ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
9607
+ break;
9608
+ }
8389
9609
  }
8390
9610
  }
8391
9611
  }
@@ -8410,6 +9630,10 @@ static struct ggml_cgraph * llama_build_graph(
8410
9630
  {
8411
9631
  result = llm.build_falcon();
8412
9632
  } break;
9633
+ case LLM_ARCH_GROK:
9634
+ {
9635
+ result = llm.build_grok();
9636
+ } break;
8413
9637
  case LLM_ARCH_STARCODER:
8414
9638
  {
8415
9639
  result = llm.build_starcoder();
@@ -8487,6 +9711,14 @@ static struct ggml_cgraph * llama_build_graph(
8487
9711
  {
8488
9712
  result = llm.build_mamba();
8489
9713
  } break;
9714
+ case LLM_ARCH_XVERSE:
9715
+ {
9716
+ result = llm.build_xverse();
9717
+ } break;
9718
+ case LLM_ARCH_COMMAND_R:
9719
+ {
9720
+ result = llm.build_command_r();
9721
+ } break;
8490
9722
  default:
8491
9723
  GGML_ASSERT(false);
8492
9724
  }
@@ -8548,9 +9780,39 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
8548
9780
  ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
8549
9781
  }
8550
9782
 
9783
+ if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
9784
+ GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
9785
+ const int64_t n_tokens = batch.n_tokens;
9786
+
9787
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
9788
+ int32_t * data = (int32_t *) lctx.inp_out_ids->data;
9789
+
9790
+ if (lctx.n_outputs == n_tokens) {
9791
+ for (int i = 0; i < n_tokens; ++i) {
9792
+ data[i] = i;
9793
+ }
9794
+ } else if (batch.logits) {
9795
+ int32_t n_outputs = 0;
9796
+ for (int i = 0; i < n_tokens; ++i) {
9797
+ if (batch.logits[i]) {
9798
+ data[n_outputs++] = i;
9799
+ }
9800
+ }
9801
+ // the graph needs to have been passed the correct number of outputs
9802
+ GGML_ASSERT(lctx.n_outputs == n_outputs);
9803
+ } else if (lctx.n_outputs == 1) {
9804
+ // only keep last output
9805
+ data[0] = n_tokens - 1;
9806
+ } else {
9807
+ GGML_ASSERT(lctx.n_outputs == 0);
9808
+ }
9809
+ }
9810
+
8551
9811
  GGML_ASSERT(
9812
+ // (!a || b) is a logical implication (a -> b)
9813
+ // !hparams.causal_attn -> !cparams.causal_attn
8552
9814
  (hparams.causal_attn || !cparams.causal_attn) &&
8553
- "non-causal attention with generative models is not supported"
9815
+ "causal attention with embedding models is not supported"
8554
9816
  );
8555
9817
 
8556
9818
  if (lctx.inp_KQ_mask) {
@@ -8729,6 +9991,74 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
8729
9991
  }
8730
9992
  }
8731
9993
 
9994
+ // Make sure enough space is available for outputs.
9995
+ // Returns max number of outputs for which space was reserved.
9996
+ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
9997
+ const auto & cparams = lctx.cparams;
9998
+ const auto & hparams = lctx.model.hparams;
9999
+
10000
+ const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
10001
+
10002
+ const auto n_batch = cparams.n_batch;
10003
+ const auto n_vocab = hparams.n_vocab;
10004
+ const auto n_embd = hparams.n_embd;
10005
+
10006
+ // TODO: use a per-batch flag for logits presence instead
10007
+ const bool has_logits = cparams.causal_attn;
10008
+ const bool has_embd = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
10009
+
10010
+ const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
10011
+ const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
10012
+
10013
+ if (lctx.output_ids.empty()) {
10014
+ // init, never resized afterwards
10015
+ lctx.output_ids.resize(n_batch);
10016
+ }
10017
+
10018
+ const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0;
10019
+ const size_t new_size = (logits_size + embd_size) * sizeof(float);
10020
+
10021
+ // alloc only when more than the current capacity is required
10022
+ // TODO: also consider shrinking the buffer
10023
+ if (!lctx.buf_output || prev_size < new_size) {
10024
+ if (lctx.buf_output) {
10025
+ #ifndef NDEBUG
10026
+ // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
10027
+ LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
10028
+ #endif
10029
+ ggml_backend_buffer_free(lctx.buf_output);
10030
+ lctx.buf_output = nullptr;
10031
+ lctx.logits = nullptr;
10032
+ lctx.embd = nullptr;
10033
+ }
10034
+
10035
+ lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
10036
+ if (lctx.buf_output == nullptr) {
10037
+ LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
10038
+ return 0;
10039
+ }
10040
+ }
10041
+
10042
+ float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output);
10043
+
10044
+ lctx.logits = has_logits ? output_base : nullptr;
10045
+ lctx.embd = has_embd ? output_base + logits_size : nullptr;
10046
+
10047
+ lctx.output_size = n_outputs_max;
10048
+ lctx.logits_size = logits_size;
10049
+ lctx.embd_size = embd_size;
10050
+
10051
+ // set all ids as invalid (negative)
10052
+ std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);
10053
+
10054
+ ggml_backend_buffer_clear(lctx.buf_output, 0);
10055
+
10056
+ lctx.n_outputs = 0;
10057
+
10058
+ return n_outputs_max;
10059
+ }
10060
+
10061
+
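llama_output_reserve sizes the output buffer from the number of requested outputs (at least n_seq_max) rather than the full batch, so a last-token-only decode stays small. A rough sizing sketch with made-up model dimensions, mirroring the logits_size/embd_size arithmetic above:

    // Illustrative sizing with hypothetical dimensions, mirroring llama_output_reserve.
    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t n_vocab       = 32000; // hypothetical vocabulary size
        const size_t n_embd        = 4096;  // hypothetical embedding width
        const size_t n_outputs_max = 8;     // e.g. one output per sequence

        const size_t logits_size = n_vocab * n_outputs_max; // floats, only if logits are kept
        const size_t embd_size   = n_embd  * n_outputs_max; // floats, only if embeddings are kept
        const size_t new_size    = (logits_size + embd_size) * sizeof(float);

        std::printf("output buffer: %.2f MiB\n", new_size / 1024.0 / 1024.0); // ~1.10 MiB
        return 0;
    }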
8732
10062
  static void llama_graph_compute(
8733
10063
  llama_context & lctx,
8734
10064
  ggml_cgraph * gf,
@@ -8804,16 +10134,8 @@ static int llama_decode_internal(
8804
10134
  const int64_t n_embd = hparams.n_embd;
8805
10135
  const int64_t n_vocab = hparams.n_vocab;
8806
10136
 
8807
-
8808
- auto * logits_out = lctx.logits;
8809
-
8810
- #ifndef NDEBUG
8811
- auto & logits_valid = lctx.logits_valid;
8812
- logits_valid.clear();
8813
- logits_valid.resize(n_tokens_all);
8814
-
8815
- memset(logits_out, 0, lctx.logits_size*sizeof(float));
8816
- #endif
10137
+ uint32_t n_outputs = 0;
10138
+ uint32_t n_outputs_prev = 0;
8817
10139
 
8818
10140
  const auto n_ubatch = cparams.n_ubatch;
8819
10141
 
@@ -8822,6 +10144,38 @@ static int llama_decode_internal(
8822
10144
  std::vector<llama_seq_id *> seq_id_arr;
8823
10145
  std::vector<std::vector<llama_seq_id>> seq_id;
8824
10146
 
10147
+ // count outputs
10148
+ if (batch_all.logits) {
10149
+ for (uint32_t i = 0; i < n_tokens_all; ++i) {
10150
+ n_outputs += batch_all.logits[i] != 0;
10151
+ }
10152
+ } else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
10153
+ n_outputs = n_tokens_all;
10154
+ } else {
10155
+ // keep last output only
10156
+ n_outputs = 1;
10157
+ }
10158
+
10159
+ // reserve output buffer
10160
+ if (llama_output_reserve(lctx, n_outputs) < n_outputs) {
10161
+ LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs);
10162
+ return -2;
10163
+ };
10164
+
10165
+ // set output mappings
10166
+ if (batch_all.logits) {
10167
+ int32_t i_logits = 0;
10168
+ for (uint32_t i = 0; i < n_tokens_all; ++i) {
10169
+ if (batch_all.logits[i]) {
10170
+ lctx.output_ids[i] = i_logits++;
10171
+ }
10172
+ }
10173
+ } else {
10174
+ for (uint32_t i = 0; i < n_outputs; ++i) {
10175
+ lctx.output_ids[i] = i;
10176
+ }
10177
+ }
10178
+
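output_ids is the indirection that keeps the compacted buffer addressable by batch position: output_ids[i] is the row in the logits/embeddings buffer that belongs to token i, or -1 when that position produced no output. A stand-alone sketch of the same mapping, using only the standard library:

    // Stand-alone illustration of the batch-position -> output-row mapping.
    #include <cstdint>
    #include <vector>

    // returns -1 for positions that produced no output
    static std::vector<int32_t> build_output_ids(const std::vector<int8_t> & logits_flags) {
        std::vector<int32_t> output_ids(logits_flags.size(), -1);
        int32_t row = 0;
        for (size_t i = 0; i < logits_flags.size(); ++i) {
            if (logits_flags[i]) {
                output_ids[i] = row++; // outputs are stored contiguously, in batch order
            }
        }
        return output_ids;
    }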
8825
10179
  for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
8826
10180
  const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
8827
10181
  llama_batch u_batch = {
@@ -8837,6 +10191,27 @@ static int llama_decode_internal(
8837
10191
  /* .all_seq_id = */ batch_all.all_seq_id,
8838
10192
  };
8839
10193
 
10194
+ // count the outputs in this u_batch
10195
+ {
10196
+ int32_t n_outputs_new = 0;
10197
+
10198
+ if (u_batch.logits) {
10199
+ for (uint32_t i = 0; i < n_tokens; i++) {
10200
+ n_outputs_new += u_batch.logits[i] != 0;
10201
+ }
10202
+ } else if (n_outputs == n_tokens_all) {
10203
+ n_outputs_new = n_tokens;
10204
+ } else {
10205
+ // keep last output only
10206
+ if (cur_token + n_tokens >= n_tokens_all) {
10207
+ n_outputs_new = 1;
10208
+ }
10209
+ }
10210
+
10211
+ // needs to happen before the graph is built
10212
+ lctx.n_outputs = n_outputs_new;
10213
+ }
10214
+
8840
10215
  int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
8841
10216
  GGML_ASSERT(n_threads > 0);
8842
10217
 
@@ -8900,23 +10275,37 @@ static int llama_decode_internal(
8900
10275
  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
8901
10276
  struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
8902
10277
 
8903
- if (!hparams.causal_attn) {
10278
+ if (lctx.n_outputs == 0) {
10279
+ // no output
10280
+ res = nullptr;
10281
+ embd = nullptr;
10282
+ } else if (!hparams.causal_attn) {
8904
10283
  res = nullptr; // do not extract logits for embedding models such as BERT
8905
10284
 
8906
10285
  // token or sequence embeddings
8907
10286
  embd = gf->nodes[gf->n_nodes - 1];
8908
10287
 
8909
10288
  GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
8910
- } else {
8911
- if (strcmp(res->name, "result_output") == 0) {
8912
- // the token embeddings could be the second to last tensor, or the third to last tensor
8913
- if (strcmp(embd->name, "result_norm") != 0) {
8914
- embd = gf->nodes[gf->n_nodes - 3];
8915
- GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
8916
- }
8917
- } else {
8918
- GGML_ASSERT(false && "missing result_output tensor");
10289
+ } else if (cparams.embeddings) {
10290
+ // the embeddings could be in the second to last tensor, or any of the previous tensors
10291
+ int i_embd = gf->n_nodes - 2;
10292
+ for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
10293
+ i_embd = gf->n_nodes - i;
10294
+ if (i_embd < 0) { break; }
10295
+ embd = gf->nodes[i_embd];
10296
+ }
10297
+ GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
10298
+
10299
+ // TODO: use a per-batch flag to know when to skip logits while keeping embeddings
10300
+ if (!cparams.causal_attn) {
10301
+ res = nullptr; // do not extract logits when not needed
10302
+ // skip computing logits
10303
+ // TODO: is this safe?
10304
+ gf->n_nodes = i_embd + 1;
8919
10305
  }
10306
+ } else {
10307
+ embd = nullptr; // do not extract embeddings when not needed
10308
+ GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
8920
10309
  }
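When embeddings are requested, the code above walks back from the graph tail to result_norm and, for non-causal contexts, truncates the graph so result_output is never computed. On the API side this corresponds to reading pooled embeddings after a decode; a short sketch, assuming llama_get_embeddings_seq is available in this build and that the context was created with embeddings enabled:

    // Sketch: pooled sequence embedding after a decode, assuming the context was
    // created with cparams.embeddings = true and a pooled cparams.pooling_type
    // such as LLAMA_POOLING_TYPE_CLS.
    #include "llama.h"

    static const float * pooled_embedding(llama_context * ctx, llama_seq_id seq) {
        // with non-causal attention or a pooled type, the graph above stops at the
        // embedding tensor and never computes result_output
        return llama_get_embeddings_seq(ctx, seq); // n_embd floats, owned by the context
    }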
8921
10310
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
8922
10311
 
@@ -8959,67 +10348,38 @@ static int llama_decode_internal(
8959
10348
  //}
8960
10349
 
8961
10350
  // extract logits
8962
- // TODO: do not compute and extract logits if only embeddings are needed
8963
- // update the graphs to skip "result_output" if logits are not needed
8964
10351
  if (res) {
8965
10352
  ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
8966
10353
  GGML_ASSERT(backend_res != nullptr);
8967
- if (u_batch.logits) {
8968
- int32_t i_first = -1;
8969
- for (uint32_t i = 0; i < n_tokens; i++) {
8970
- if (u_batch.logits[i] && i_first == -1) {
8971
- i_first = (int32_t) i;
8972
- }
8973
- if (u_batch.logits[i] == 0 || i == n_tokens - 1) {
8974
- if (i_first != -1) {
8975
- int i_last = u_batch.logits[i] == 0 ? i : i + 1;
8976
- // extract logits for the range [i_first, i_last)
8977
- // group the requests to minimize the number of calls to the backend
8978
- ggml_backend_tensor_get_async(backend_res, res,
8979
- logits_out + n_vocab*(cur_token + i_first),
8980
- i_first*n_vocab*sizeof(float),
8981
- (i_last - i_first)*n_vocab*sizeof(float));
8982
- i_first = -1;
8983
- }
8984
- }
8985
- #ifndef NDEBUG
8986
- logits_valid[cur_token + i] = u_batch.logits[i] != 0;;
8987
- #endif
8988
- }
8989
- } else if (lctx.logits_all) {
8990
- ggml_backend_tensor_get_async(backend_res, res, logits_out + n_vocab*cur_token, 0, n_vocab*n_tokens*sizeof(float));
8991
- #ifndef NDEBUG
8992
- std::fill(logits_valid.begin() + cur_token, logits_valid.begin() + cur_token + n_tokens, true);
8993
- #endif
8994
- } else {
8995
- if (cur_token + n_tokens >= n_tokens_all) {
8996
- ggml_backend_tensor_get_async(backend_res, res, logits_out, n_vocab*(n_tokens - 1)*sizeof(float), n_vocab*sizeof(float));
8997
- #ifndef NDEBUG
8998
- logits_valid[0] = true;
8999
- #endif
9000
- }
10354
+ GGML_ASSERT(lctx.logits != nullptr);
10355
+
10356
+ float * logits_out = lctx.logits + n_outputs_prev*n_vocab;
10357
+ const int32_t n_outputs_new = lctx.n_outputs;
10358
+
10359
+ if (n_outputs_new) {
10360
+ GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
10361
+ GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size);
10362
+ ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float));
9001
10363
  }
9002
10364
  }
9003
10365
 
9004
10366
  // extract embeddings
9005
- if (cparams.embeddings && embd) {
10367
+ if (embd) {
9006
10368
  ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
9007
10369
  GGML_ASSERT(backend_embd != nullptr);
9008
10370
 
9009
10371
  switch (cparams.pooling_type) {
9010
10372
  case LLAMA_POOLING_TYPE_NONE:
9011
- {
9012
- // extract token embeddings
9013
- auto & embd_out = lctx.embd;
9014
-
9015
- if (u_batch.logits) {
9016
- //embd_out.resize(n_embd * n_tokens);
9017
- for (uint32_t i = 0; i < n_tokens; i++) {
9018
- if (u_batch.logits[i] == 0) {
9019
- continue;
9020
- }
9021
- ggml_backend_tensor_get_async(backend_embd, embd, embd_out + n_embd*(i + cur_token), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
9022
- }
10373
+ {
10374
+ // extract token embeddings
10375
+ GGML_ASSERT(lctx.embd != nullptr);
10376
+ float * embd_out = lctx.embd + n_outputs_prev*n_embd;
10377
+ const int32_t n_outputs_new = lctx.n_outputs;
10378
+
10379
+ if (n_outputs_new) {
10380
+ GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
10381
+ GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size);
10382
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
9023
10383
  }
9024
10384
  } break;
9025
10385
  case LLAMA_POOLING_TYPE_CLS:
@@ -9046,6 +10406,7 @@ static int llama_decode_internal(
9046
10406
  } break;
9047
10407
  }
9048
10408
  }
10409
+ n_outputs_prev += lctx.n_outputs;
9049
10410
  }
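After the micro-batch loop, the logits of all requested positions sit contiguously in lctx.logits in batch order, and the public accessors resolve a batch index through output_ids. A small sketch of a greedy pick over those logits, assuming llama_get_logits_ith, llama_get_model and llama_n_vocab from the public header:

    // Sketch: greedy pick from the logits of batch position i_pos after llama_decode.
    #include "llama.h"

    static llama_token greedy_token(llama_context * ctx, int32_t i_pos) {
        const float * logits  = llama_get_logits_ith(ctx, i_pos);    // batch index, mapped via output_ids
        const int32_t n_vocab = llama_n_vocab(llama_get_model(ctx));
        llama_token best = 0;
        for (llama_token t = 1; t < n_vocab; ++t) {
            if (logits[t] > logits[best]) {
                best = t;
            }
        }
        return best;
    }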
9050
10411
 
9051
10412
  // wait for the computation to finish (automatically done when obtaining the model output)
@@ -9976,7 +11337,7 @@ struct llm_tokenizer_wpm {
9976
11337
  if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
9977
11338
  continue;
9978
11339
  }
9979
- code = to_lower(code);
11340
+ code = unicode_tolower(code);
9980
11341
  if (type == CODEPOINT_TYPE_WHITESPACE) {
9981
11342
  code = ' ';
9982
11343
  }
@@ -9996,7 +11357,7 @@ struct llm_tokenizer_wpm {
9996
11357
  std::vector<std::string> words;
9997
11358
  while (r < new_str.size()) {
9998
11359
  // if is whitespace
9999
- if (isspace(new_str[r])) {
11360
+ if (isspace(new_str[r], std::locale::classic())) {
10000
11361
  if (r > l) words.push_back(new_str.substr(l, (r - l)));
10001
11362
  l = r + 1;
10002
11363
  r = l;
@@ -10010,18 +11371,12 @@ struct llm_tokenizer_wpm {
10010
11371
  return words;
10011
11372
  }
10012
11373
 
10013
- uint32_t to_lower(uint32_t code) {
10014
- static const std::locale locale("en_US.UTF-8");
10015
- #if defined(_WIN32)
10016
- if (code > 0xFFFF) {
10017
- return code;
10018
- }
10019
- #endif
10020
- return std::tolower(wchar_t(code), locale);
10021
- }
10022
-
10023
11374
  bool is_ascii_punct(uint32_t code) {
10024
- return code < 256 && ispunct(code);
11375
+ if (code > 0xFF) {
11376
+ return false;
11377
+ }
11378
+ auto c = char(static_cast<unsigned char>(code));
11379
+ return ispunct(c, std::locale::classic());
10025
11380
  }
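The WPM tokenizer now classifies characters against the classic "C" locale instead of the process-wide locale, so tokenization no longer changes with setlocale(). A stand-alone illustration of the same pattern applied to the whitespace split above:

    // Stand-alone word split on whitespace using the classic "C" locale,
    // mirroring the loop above without depending on the global locale.
    #include <locale>
    #include <string>
    #include <vector>

    static std::vector<std::string> split_on_space(const std::string & s) {
        std::vector<std::string> words;
        size_t l = 0;
        for (size_t r = 0; r < s.size(); ++r) {
            if (std::isspace(s[r], std::locale::classic())) {
                if (r > l) words.push_back(s.substr(l, r - l));
                l = r + 1;
            }
        }
        if (l < s.size()) words.push_back(s.substr(l));
        return words;
    }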
10026
11381
 
10027
11382
  bool is_chinese_char(uint32_t cpt) {
@@ -10266,28 +11621,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
10266
11621
  // grammar - internal
10267
11622
  //
10268
11623
 
10269
- struct llama_partial_utf8 {
10270
- uint32_t value; // bit value so far (unshifted)
10271
- int n_remain; // num bytes remaining; -1 indicates invalid sequence
10272
- };
10273
-
10274
- struct llama_grammar {
10275
- const std::vector<std::vector<llama_grammar_element>> rules;
10276
- std::vector<std::vector<const llama_grammar_element *>> stacks;
10277
-
10278
- // buffer for partially generated UTF-8 sequence from accepted tokens
10279
- llama_partial_utf8 partial_utf8;
10280
- };
10281
-
10282
- struct llama_grammar_candidate {
10283
- size_t index;
10284
- const uint32_t * code_points;
10285
- llama_partial_utf8 partial_utf8;
10286
- };
10287
11624
 
10288
11625
  // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
10289
11626
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
10290
- static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
11627
+ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
10291
11628
  const std::string & src,
10292
11629
  llama_partial_utf8 partial_start) {
10293
11630
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@@ -10489,7 +11826,7 @@ static void llama_grammar_advance_stack(
10489
11826
  // be positioned at a character range (see `llama_grammar_advance_stack`), and
10490
11827
  // produces the N possible stacks if the given char is accepted at those
10491
11828
  // positions
10492
- static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
11829
+ std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
10493
11830
  const std::vector<std::vector<llama_grammar_element>> & rules,
10494
11831
  const std::vector<std::vector<const llama_grammar_element *>> & stacks,
10495
11832
  const uint32_t chr) {
@@ -11715,7 +13052,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
11715
13052
  // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
11716
13053
  // for getting the current layer as I initially thought, and we need to resort to parsing the
11717
13054
  // tensor name.
11718
- n_layer /= n_expert;
11719
13055
  if (sscanf(name, "blk.%d.", &i_layer) != 1) {
11720
13056
  throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
11721
13057
  }
@@ -11729,30 +13065,39 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
11729
13065
  // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
11730
13066
  // with the quantization of the output tensor
11731
13067
  if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
11732
- int nx = tensor->ne[0];
11733
- if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
11734
- new_type = GGML_TYPE_Q8_0;
11735
- }
11736
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
11737
- ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
11738
- new_type = GGML_TYPE_Q5_K;
11739
- }
11740
- else if (new_type != GGML_TYPE_Q8_0) {
11741
- new_type = GGML_TYPE_Q6_K;
13068
+ if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
13069
+ new_type = qs.params->output_tensor_type;
13070
+ } else {
13071
+ int nx = tensor->ne[0];
13072
+ if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
13073
+ new_type = GGML_TYPE_Q8_0;
13074
+ }
13075
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
13076
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
13077
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
13078
+ new_type = GGML_TYPE_Q5_K;
13079
+ }
13080
+ else if (new_type != GGML_TYPE_Q8_0) {
13081
+ new_type = GGML_TYPE_Q6_K;
13082
+ }
11742
13083
  }
11743
13084
  } else if (name == "token_embd.weight") {
11744
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
11745
- ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
11746
- new_type = GGML_TYPE_Q2_K;
11747
- }
11748
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
11749
- new_type = GGML_TYPE_IQ3_S;
11750
- }
11751
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
11752
- new_type = GGML_TYPE_IQ3_S;
13085
+ if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
13086
+ new_type = qs.params->token_embedding_type;
13087
+ } else {
13088
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
13089
+ ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
13090
+ new_type = GGML_TYPE_Q2_K;
13091
+ }
13092
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
13093
+ new_type = GGML_TYPE_IQ3_S;
13094
+ }
13095
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
13096
+ new_type = GGML_TYPE_IQ3_S;
13097
+ }
11753
13098
  }
11754
13099
  } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
11755
- ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
13100
+ ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
11756
13101
  if (name.find("attn_v.weight") != std::string::npos) {
11757
13102
  if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
11758
13103
  else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
@@ -11771,7 +13116,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
11771
13116
  if (qs.model.hparams.n_expert == 8) {
11772
13117
  new_type = GGML_TYPE_Q5_K;
11773
13118
  } else {
11774
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
13119
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
11775
13120
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
11776
13121
  }
11777
13122
  }
@@ -11785,13 +13130,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
11785
13130
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
11786
13131
  new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
11787
13132
  }
11788
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
11789
- new_type = GGML_TYPE_Q4_K;
11790
- }
11791
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
11792
- new_type = GGML_TYPE_Q4_K;
11793
- }
11794
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
13133
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
11795
13134
  new_type = GGML_TYPE_Q4_K;
11796
13135
  }
11797
13136
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
@@ -11944,7 +13283,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
11944
13283
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
11945
13284
  new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
11946
13285
  new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
11947
- new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
13286
+ new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
13287
+ new_type == GGML_TYPE_IQ1_M) {
11948
13288
  int nx = tensor->ne[0];
11949
13289
  int ny = tensor->ne[1];
11950
13290
  if (nx % QK_K != 0) {
@@ -11962,6 +13302,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
11962
13302
  case GGML_TYPE_IQ3_XXS:
11963
13303
  case GGML_TYPE_IQ3_S:
11964
13304
  case GGML_TYPE_IQ1_S:
13305
+ case GGML_TYPE_IQ1_M:
11965
13306
  case GGML_TYPE_Q2_K:
11966
13307
  case GGML_TYPE_Q3_K:
11967
13308
  case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
@@ -12043,6 +13384,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12043
13384
  case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
12044
13385
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
12045
13386
  case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
13387
+ case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
12046
13388
  case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
12047
13389
  case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
12048
13390
  case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
@@ -12065,8 +13407,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12065
13407
  constexpr bool use_mmap = false;
12066
13408
  #endif
12067
13409
 
12068
- llama_model_loader ml(fname_inp, use_mmap, NULL);
12069
- ml.init_mapping(false); // no prefetching?
13410
+ llama_model_kv_override * kv_overrides = nullptr;
13411
+ if (params->kv_overrides) {
13412
+ auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
13413
+ kv_overrides = v->data();
13414
+ }
13415
+ llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
13416
+ ml.init_mappings(false); // no prefetching
12070
13417
 
12071
13418
  llama_model model;
12072
13419
  llm_load_arch(ml, model);
@@ -12090,36 +13437,43 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12090
13437
  struct gguf_context * ctx_out = gguf_init_empty();
12091
13438
 
12092
13439
  // copy the KV pairs from the input file
12093
- gguf_set_kv (ctx_out, ml.ctx_gguf);
13440
+ gguf_set_kv (ctx_out, ml.meta);
12094
13441
  gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
12095
13442
  gguf_set_val_u32(ctx_out, "general.file_type", ftype);
12096
13443
 
13444
+ if (params->kv_overrides) {
13445
+ const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
13446
+ for (auto & o : overrides) {
13447
+ if (o.key[0] == 0) break;
13448
+ if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
13449
+ gguf_set_val_f32(ctx_out, o.key, o.float_value);
13450
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
13451
+ gguf_set_val_i32(ctx_out, o.key, o.int_value);
13452
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
13453
+ gguf_set_val_bool(ctx_out, o.key, o.bool_value);
13454
+ } else {
13455
+ LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
13456
+ }
13457
+ }
13458
+ }
13459
+
12097
13460
  for (int i = 0; i < ml.n_tensors; ++i) {
12098
- struct ggml_tensor * meta = ml.get_tensor_meta(i);
13461
+ const struct ggml_tensor * meta = ml.get_tensor_meta(i);
12099
13462
 
12100
13463
  const std::string name = ggml_get_name(meta);
12101
13464
 
12102
13465
  // TODO: avoid hardcoded tensor names - use the TN_* constants
12103
13466
  if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
12104
13467
  ++qs.n_attention_wv;
12105
- }
12106
- else if (name.find("ffn_down") != std::string::npos) {
12107
- ++qs.n_ffn_down;
12108
- }
12109
- else if (name.find("ffn_gate") != std::string::npos) {
12110
- ++qs.n_ffn_gate;
12111
- }
12112
- else if (name.find("ffn_up") != std::string::npos) {
12113
- ++qs.n_ffn_up;
12114
- }
12115
- else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
13468
+ } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
12116
13469
  qs.has_output = true;
12117
13470
  }
12118
13471
  }
12119
- if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
12120
- LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
12121
- __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
12122
- }
13472
+
13473
+ qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
13474
+
13475
+ // sanity checks
13476
+ GGML_ASSERT(qs.n_attention_wv == (int)model.hparams.n_layer && "n_attention_wv != n_layer is unexpected");
12123
13477
 
12124
13478
  size_t total_size_org = 0;
12125
13479
  size_t total_size_new = 0;
@@ -12135,7 +13489,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12135
13489
 
12136
13490
  // populate the original tensors so we get an initial meta data
12137
13491
  for (int i = 0; i < ml.n_tensors; ++i) {
12138
- struct ggml_tensor * meta = ml.get_tensor_meta(i);
13492
+ const struct ggml_tensor * meta = ml.get_tensor_meta(i);
12139
13493
  gguf_add_tensor(ctx_out, meta);
12140
13494
  }
12141
13495
 
@@ -12149,6 +13503,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12149
13503
  // placeholder for the meta data
12150
13504
  ::zeros(fout, meta_size);
12151
13505
 
13506
+ const auto tn = LLM_TN(model.arch);
13507
+
12152
13508
  for (int i = 0; i < ml.n_tensors; ++i) {
12153
13509
  struct ggml_tensor * tensor = ml.get_tensor_meta(i);
12154
13510
 
@@ -12171,8 +13527,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12171
13527
  // This used to be a regex, but <regex> has an extreme cost to compile times.
12172
13528
  bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
12173
13529
 
12174
- // quantize only 2D tensors
12175
- quantize &= (ggml_n_dims(tensor) == 2);
13530
+ // quantize only 2D and 3D tensors (experts)
13531
+ quantize &= (ggml_n_dims(tensor) >= 2);
12176
13532
  quantize &= params->quantize_output_tensor || name != "output.weight";
12177
13533
  quantize &= !params->only_copy;
12178
13534
 
@@ -12201,6 +13557,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12201
13557
  if (!params->pure && ggml_is_quantized(default_type)) {
12202
13558
  new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
12203
13559
  }
13560
+ else if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
13561
+ new_type = params->token_embedding_type;
13562
+ }
13563
+ else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
13564
+ new_type = params->output_tensor_type;
13565
+ }
12204
13566
 
12205
13567
  // If we've decided to quantize to the same type the tensor is already
12206
13568
  // in then there's nothing to do.
@@ -12221,11 +13583,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12221
13583
  if (it == imatrix_data->end()) {
12222
13584
  LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
12223
13585
  } else {
12224
- if (it->second.size() == (size_t)tensor->ne[0]) {
13586
+ if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
12225
13587
  imatrix = it->second.data();
12226
13588
  } else {
12227
13589
  LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
12228
- int(it->second.size()), int(tensor->ne[0]), tensor->name);
13590
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
13591
+
13592
+ // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
13593
+ // this is a significant error and it may be a good idea to abort the process if this happens,
13594
+ // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
13595
+ // tok_embd should be ignored in this case, since it always causes this warning
13596
+ if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
13597
+ throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
13598
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
13599
+ }
12229
13600
  }
12230
13601
  }
12231
13602
  }
@@ -12233,6 +13604,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12233
13604
  new_type == GGML_TYPE_IQ2_XS ||
12234
13605
  new_type == GGML_TYPE_IQ2_S ||
12235
13606
  new_type == GGML_TYPE_IQ1_S ||
13607
+ (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
12236
13608
  (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
12237
13609
  LLAMA_LOG_ERROR("\n\n============================================================\n");
12238
13610
  LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
@@ -12261,15 +13633,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
12261
13633
  new_data = work.data();
12262
13634
 
12263
13635
  const int n_per_row = tensor->ne[0];
12264
- const int nrows = nelements / n_per_row;
13636
+ const int nrows = tensor->ne[1];
12265
13637
 
12266
13638
  static const int min_chunk_size = 32 * 512;
12267
13639
  const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
12268
13640
 
12269
- const int nchunk = (nelements + chunk_size - 1)/chunk_size;
13641
+ const int nelements_matrix = tensor->ne[0] * tensor->ne[1];
13642
+ const int nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
12270
13643
  const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
12271
- new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix, workers, nthread_use);
12272
13644
 
13645
+ // quantize each expert separately since they have different importance matrices
13646
+ new_size = 0;
13647
+ for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
13648
+ const float * f32_data_03 = f32_data + i03 * nelements_matrix;
13649
+ void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
13650
+ const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
13651
+
13652
+ new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
13653
+ }
12273
13654
  LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
12274
13655
  }
12275
13656
  total_size_org += ggml_nbytes(tensor);
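Expert tensors are 3D (rows x columns x experts) and each expert carries its own importance matrix, so the loop above quantizes one ne[0] x ne[1] slice at a time and advances the imatrix by n_per_row per expert. A stand-alone sketch of just the offset bookkeeping (illustration only, not the quantization itself):

    // Offset bookkeeping for per-expert quantization of a 3D tensor
    // with shape { n_per_row, nrows, n_expert }.
    #include <cstddef>
    #include <cstdint>

    struct expert_slice {
        size_t f32_offset;     // floats into the dequantized source data
        size_t imatrix_offset; // floats into the importance matrix
        size_t rows;           // rows quantized for this expert
    };

    static expert_slice slice_for_expert(int64_t n_per_row, int64_t nrows, int64_t i_expert) {
        expert_slice s;
        s.f32_offset     = (size_t) (i_expert * n_per_row * nrows); // one full matrix per expert
        s.imatrix_offset = (size_t) (i_expert * n_per_row);         // one row of weights per expert
        s.rows           = (size_t) nrows;
        return s;
    }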
@@ -12340,7 +13721,7 @@ static int llama_apply_lora_from_file_internal(
12340
13721
  if (path_base_model) {
12341
13722
  LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
12342
13723
  ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
12343
- ml->init_mapping(/*prefetch*/ false); // no prefetching
13724
+ ml->init_mappings(/*prefetch*/ false); // no prefetching
12344
13725
  }
12345
13726
 
12346
13727
  struct tensor_meta {
@@ -12461,7 +13842,7 @@ static int llama_apply_lora_from_file_internal(
12461
13842
 
12462
13843
  ggml_tensor * base_t;
12463
13844
  if (ml) {
12464
- if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) {
13845
+ if (!ml->get_tensor_meta(base_name.c_str())) {
12465
13846
  LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
12466
13847
  return 1;
12467
13848
  }
@@ -12645,11 +14026,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
12645
14026
  struct llama_model_quantize_params result = {
12646
14027
  /*.nthread =*/ 0,
12647
14028
  /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
14029
+ /*.output_tensor_type =*/ GGML_TYPE_COUNT,
14030
+ /*.token_embedding_type =*/ GGML_TYPE_COUNT,
12648
14031
  /*.allow_requantize =*/ false,
12649
14032
  /*.quantize_output_tensor =*/ true,
12650
14033
  /*.only_copy =*/ false,
12651
14034
  /*.pure =*/ false,
12652
14035
  /*.imatrix =*/ nullptr,
14036
+ /*.kv_overrides =*/ nullptr,
12653
14037
  };
12654
14038
 
12655
14039
  return result;
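The new fields in llama_model_quantize_params let a caller pin the types used for output.weight and token_embd.weight and forward GGUF KV overrides to the writer above. A hedged usage sketch; the override key shown is hypothetical, and kv_overrides is passed as a pointer to a std::vector<llama_model_kv_override> terminated by a zero key, matching how it is consumed above:

    // Sketch: pin output/token-embedding types and pass KV overrides when quantizing.
    #include "llama.h"
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    static uint32_t quantize_with_overrides(const char * fname_inp, const char * fname_out) {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M;
        params.output_tensor_type   = GGML_TYPE_Q6_K; // applied to output.weight
        params.token_embedding_type = GGML_TYPE_Q4_K; // applied to token_embd.weight

        std::vector<llama_model_kv_override> overrides(2);
        std::snprintf(overrides[0].key, sizeof(overrides[0].key), "%s", "some.custom.key"); // hypothetical key
        overrides[0].tag       = LLAMA_KV_OVERRIDE_TYPE_INT;
        overrides[0].int_value = 42;
        overrides[1].key[0]    = 0; // zero key terminates the list (see the loop above)

        params.kv_overrides = &overrides; // consumed as std::vector<llama_model_kv_override> *

        return llama_model_quantize(fname_inp, fname_out, &params);
    }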
@@ -12658,7 +14042,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
12658
14042
  size_t llama_max_devices(void) {
12659
14043
  #if defined(GGML_USE_METAL)
12660
14044
  return 1;
12661
- #elif defined(GGML_USE_CUBLAS)
14045
+ #elif defined(GGML_USE_CUDA)
12662
14046
  return GGML_CUDA_MAX_DEVICES;
12663
14047
  #elif defined(GGML_USE_SYCL)
12664
14048
  return GGML_SYCL_MAX_DEVICES;
@@ -12678,8 +14062,8 @@ bool llama_supports_mlock(void) {
12678
14062
  }
12679
14063
 
12680
14064
  bool llama_supports_gpu_offload(void) {
12681
- #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
12682
- defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
14065
+ #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
14066
+ defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
12683
14067
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
12684
14068
  return true;
12685
14069
  #else
@@ -12786,7 +14170,7 @@ struct llama_context * llama_new_context_with_model(
12786
14170
  const auto & hparams = model->hparams;
12787
14171
  auto & cparams = ctx->cparams;
12788
14172
 
12789
- // TODO: maybe add n_seq_max here too
14173
+ cparams.n_seq_max = std::max(1u, params.n_seq_max);
12790
14174
  cparams.n_threads = params.n_threads;
12791
14175
  cparams.n_threads_batch = params.n_threads_batch;
12792
14176
  cparams.yarn_ext_factor = params.yarn_ext_factor;
@@ -12802,6 +14186,9 @@ struct llama_context * llama_new_context_with_model(
12802
14186
  cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
12803
14187
  cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
12804
14188
 
14189
+ // this is necessary due to kv_self.n being padded later during inference
14190
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, 32);
14191
+
12805
14192
  // with causal attention, the batch size is limited by the context size
12806
14193
  cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
12807
14194
  cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
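Since kv_self.n is padded during inference, the requested context size is now rounded up to a multiple of 32 at context creation; asking for n_ctx = 1000 therefore yields an effective context of 1024. The rounding itself is the usual round-up-to-multiple arithmetic (the GGML_PAD macro lives in ggml.h):

    // The rounding behind the padding above: round n_ctx up to a multiple of 32.
    #include <cstdint>

    static uint32_t pad_to_32(uint32_t n_ctx) {
        return ((n_ctx + 31u) / 32u) * 32u;
    }

    // pad_to_32(1000) == 1024, pad_to_32(4096) == 4096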
@@ -12881,32 +14268,43 @@ struct llama_context * llama_new_context_with_model(
12881
14268
  }
12882
14269
  ctx->backends.push_back(ctx->backend_metal);
12883
14270
  }
12884
- #elif defined(GGML_USE_CUBLAS)
12885
- if (model->n_gpu_layers > 0) {
14271
+ #elif defined(GGML_USE_CUDA)
14272
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
12886
14273
  // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
12887
- if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
12888
- ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
14274
+ ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
14275
+ if (backend == nullptr) {
14276
+ LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
14277
+ llama_free(ctx);
14278
+ return nullptr;
14279
+ }
14280
+ ctx->backends.push_back(backend);
14281
+ } else {
14282
+ // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
14283
+ for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
14284
+ ggml_backend_t backend = ggml_backend_cuda_init(device);
12889
14285
  if (backend == nullptr) {
12890
- LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
14286
+ LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
12891
14287
  llama_free(ctx);
12892
14288
  return nullptr;
12893
14289
  }
12894
14290
  ctx->backends.push_back(backend);
12895
- } else {
12896
- // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
12897
- for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
12898
- ggml_backend_t backend = ggml_backend_cuda_init(device);
12899
- if (backend == nullptr) {
12900
- LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
12901
- llama_free(ctx);
12902
- return nullptr;
12903
- }
12904
- ctx->backends.push_back(backend);
12905
- }
12906
14291
  }
12907
14292
  }
12908
14293
  #elif defined(GGML_USE_VULKAN)
12909
- if (model->n_gpu_layers > 0) {
14294
+ if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
14295
+ LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
14296
+ llama_free(ctx);
14297
+ return nullptr;
14298
+ }
14299
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
14300
+ ggml_backend_t backend = ggml_backend_vk_init(0);
14301
+ if (backend == nullptr) {
14302
+ LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
14303
+ llama_free(ctx);
14304
+ return nullptr;
14305
+ }
14306
+ ctx->backends.push_back(backend);
14307
+ } else {
12910
14308
  for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
12911
14309
  ggml_backend_t backend = ggml_backend_vk_init(device);
12912
14310
  if (backend == nullptr) {
@@ -12918,31 +14316,28 @@ struct llama_context * llama_new_context_with_model(
12918
14316
  }
12919
14317
  }
12920
14318
  #elif defined(GGML_USE_SYCL)
12921
- if (model->n_gpu_layers > 0) {
12922
- // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
12923
- if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
12924
- int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
12925
- ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
14319
+ // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
14320
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
14321
+ ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
14322
+ if (backend == nullptr) {
14323
+ int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
14324
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
14325
+ llama_free(ctx);
14326
+ return nullptr;
14327
+ }
14328
+ ctx->backends.push_back(backend);
14329
+ } else {
14330
+ // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
14331
+ for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
14332
+ ggml_backend_t backend = ggml_backend_sycl_init(i);
12926
14333
  if (backend == nullptr) {
12927
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
14334
+ int id_list[GGML_SYCL_MAX_DEVICES];
14335
+ ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
14336
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
12928
14337
  llama_free(ctx);
12929
14338
  return nullptr;
12930
14339
  }
12931
14340
  ctx->backends.push_back(backend);
12932
- } else {
12933
- // LLAMA_SPLIT_LAYER requires a backend for each GPU
12934
- int id_list[GGML_SYCL_MAX_DEVICES];
12935
- ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
12936
- for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
12937
- int device_id = id_list[i];
12938
- ggml_backend_t backend = ggml_backend_sycl_init(i);
12939
- if (backend == nullptr) {
12940
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
12941
- llama_free(ctx);
12942
- return nullptr;
12943
- }
12944
- ctx->backends.push_back(backend);
12945
- }
12946
14341
  }
12947
14342
  }
12948
14343
  #elif defined(GGML_USE_KOMPUTE)
@@ -12990,25 +14385,12 @@ struct llama_context * llama_new_context_with_model(
12990
14385
 
12991
14386
  // graph outputs buffer
12992
14387
  {
12993
- // resized during inference, reserve maximum
12994
- ctx->logits_size = hparams.n_vocab*cparams.n_batch;
12995
- ctx->embd_size = params.embeddings ? hparams.n_embd*cparams.n_batch : 0;
12996
-
12997
- const size_t buf_output_size = (ctx->logits_size + ctx->embd_size)*sizeof(float);
12998
-
12999
- ctx->buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_output_size);
13000
- if (ctx->buf_output == nullptr) {
13001
- LLAMA_LOG_ERROR("%s: failed to allocate logits buffer\n", __func__);
14388
+ // resized during inference when a batch uses more outputs
14389
+ if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
14390
+ LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
13002
14391
  llama_free(ctx);
13003
14392
  return nullptr;
13004
14393
  }
13005
- ggml_backend_buffer_clear(ctx->buf_output, 0);
13006
-
13007
-
13008
- ctx->logits = (float *) ggml_backend_buffer_get_base(ctx->buf_output);
13009
- if (params.embeddings) {
13010
- ctx->embd = ctx->logits + ctx->logits_size;
13011
- }
13012
14394
 
13013
14395
  LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
13014
14396
  ggml_backend_buffer_name(ctx->buf_output),
@@ -13033,7 +14415,7 @@ struct llama_context * llama_new_context_with_model(
13033
14415
 
13034
14416
  // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
13035
14417
  bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
13036
- #ifndef GGML_USE_CUBLAS
14418
+ #ifndef GGML_USE_CUDA
13037
14419
  // pipeline parallelism requires support for async compute and events
13038
14420
  // currently this is only implemented in the CUDA backend
13039
14421
  pipeline_parallel = false;
@@ -13061,14 +14443,17 @@ struct llama_context * llama_new_context_with_model(
13061
14443
  ggml_backend_t backend = ctx->backends[i];
13062
14444
  ggml_backend_buffer_type_t buft = backend_buft[i];
13063
14445
  size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
13064
- LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
13065
- ggml_backend_buft_name(buft),
13066
- size / 1024.0 / 1024.0);
14446
+ if (size > 1) {
14447
+ LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
14448
+ ggml_backend_buft_name(buft),
14449
+ size / 1024.0 / 1024.0);
14450
+ }
13067
14451
  }
13068
14452
 
13069
14453
  // note: the number of splits during measure is higher than during inference due to the kv shift
13070
14454
  int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
13071
- LLAMA_LOG_INFO("%s: graph splits: %d\n", __func__, n_splits);
14455
+ LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
14456
+ LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
13072
14457
  }
13073
14458
  }
13074
14459
 
@@ -13138,10 +14523,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
13138
14523
  case LLM_ARCH_ORION:
13139
14524
  case LLM_ARCH_INTERNLM2:
13140
14525
  case LLM_ARCH_MINICPM:
14526
+ case LLM_ARCH_XVERSE:
14527
+ case LLM_ARCH_COMMAND_R:
13141
14528
  return LLAMA_ROPE_TYPE_NORM;
13142
14529
 
13143
14530
  // the pairs of head values are offset by n_rot/2
13144
14531
  case LLM_ARCH_FALCON:
14532
+ case LLM_ARCH_GROK:
13145
14533
  case LLM_ARCH_PERSIMMON:
13146
14534
  case LLM_ARCH_BERT:
13147
14535
  case LLM_ARCH_NOMIC_BERT:
@@ -13174,6 +14562,10 @@ int32_t llama_n_embd(const struct llama_model * model) {
13174
14562
  return model->hparams.n_embd;
13175
14563
  }
13176
14564
 
14565
+ int32_t llama_n_layer(const struct llama_model * model) {
14566
+ return model->hparams.n_layer;
14567
+ }
14568
+
13177
14569
  float llama_rope_freq_scale_train(const struct llama_model * model) {
13178
14570
  return model->hparams.rope_freq_scale_train;
13179
14571
  }
@@ -13273,6 +14665,96 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
13273
14665
  }
13274
14666
  }
13275
14667
 
14668
+ static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
14669
+ GGML_ASSERT(cvec.tensors.empty());
14670
+ GGML_ASSERT(cvec.ctxs.empty());
14671
+ GGML_ASSERT(cvec.bufs.empty());
14672
+
14673
+ // count layer buffer types
14674
+ std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
14675
+ for (int64_t i = 0; i < model.hparams.n_layer; i++) {
14676
+ buft_layer_count[model.buft_layer[i].buft]++;
14677
+ }
14678
+
14679
+ // allocate contexts
14680
+ std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
14681
+ for (auto & it : buft_layer_count) {
14682
+ int n_layers = it.second;
14683
+ struct ggml_init_params params = {
14684
+ /*.mem_size =*/ n_layers * ggml_tensor_overhead(),
14685
+ /*.mem_buffer =*/ NULL,
14686
+ /*.no_alloc =*/ true,
14687
+ };
14688
+ ggml_context * ctx = ggml_init(params);
14689
+ if (!ctx) {
14690
+ LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
14691
+ return 1;
14692
+ }
14693
+ ctx_map[it.first] = ctx;
14694
+ }
14695
+
14696
+ // make tensors
14697
+ cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
14698
+ for (size_t il = 1; il < model.hparams.n_layer; il++) {
14699
+ struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
14700
+ ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
14701
+ cvec.tensors.push_back(tensor);
14702
+ }
14703
+
14704
+ // allocate tensors / buffers and zero
14705
+ for (auto it : ctx_map) {
14706
+ ggml_backend_buffer_type_t buft = it.first;
14707
+ ggml_context * ctx = it.second;
14708
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
14709
+ if (!buf) {
14710
+ LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
14711
+ return false;
14712
+ }
14713
+ ggml_backend_buffer_clear(buf, 0);
14714
+ cvec.ctxs.push_back(ctx);
14715
+ cvec.bufs.push_back(buf);
14716
+ }
14717
+
14718
+ return true;
14719
+ }
14720
+
14721
+ int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
14722
+ const llama_model & model = lctx->model;
14723
+ llama_control_vector & cvec = lctx->cvec;
14724
+
14725
+ if (data == nullptr) {
14726
+ // disable the current control vector (but leave allocated for later)
14727
+ cvec.layer_start = -1;
14728
+ cvec.layer_end = -1;
14729
+ return 0;
14730
+ }
14731
+
14732
+ if (n_embd != (int) model.hparams.n_embd) {
14733
+ LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
14734
+ return 1;
14735
+ }
14736
+
14737
+ if (cvec.tensors.empty()) {
14738
+ if (!llama_control_vector_init(cvec, model)) {
14739
+ return 1;
14740
+ }
14741
+ }
14742
+
14743
+ cvec.layer_start = il_start;
14744
+ cvec.layer_end = il_end;
14745
+
14746
+ for (size_t il = 1; il < model.hparams.n_layer; il++) {
14747
+ assert(cvec.tensors[il] != nullptr);
14748
+
14749
+ const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
14750
+ if (off + n_embd <= len) {
14751
+ ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
14752
+ }
14753
+ }
14754
+
14755
+ return 0;
14756
+ }
14757
+
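llama_control_vector_apply expects one n_embd-wide direction per layer, flattened with layer 1 first (layer 0 never gets a tensor), and a nullptr data pointer disables the active vector without freeing its buffers. A short usage sketch against the signature above:

    // Sketch: apply a control vector to layers [il_start, il_end] of a context.
    #include "llama.h"
    #include <vector>

    static bool apply_direction(llama_context * ctx, const std::vector<float> & flat,
                                int32_t n_embd, int32_t il_start, int32_t il_end) {
        // flat holds (n_layer - 1) * n_embd floats: layer 1 first, layer 0 is skipped
        const int32_t err = llama_control_vector_apply(ctx, flat.data(), flat.size(),
                                                       n_embd, il_start, il_end);
        return err == 0;
    }

    // disabling later, without releasing the allocated buffers:
    // llama_control_vector_apply(ctx, nullptr, 0, 0, 0, 0);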
13276
14758
  struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
13277
14759
  struct llama_kv_cache_view result = {
13278
14760
  /*.n_cells = */ 0,
@@ -13426,27 +14908,33 @@ void llama_kv_cache_update(struct llama_context * ctx) {
13426
14908
 
13427
14909
  // Returns the *maximum* size of the state
13428
14910
  size_t llama_get_state_size(const struct llama_context * ctx) {
14911
+ const auto & cparams = ctx->cparams;
14912
+ const auto & hparams = ctx->model.hparams;
14913
+
13429
14914
  // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
13430
14915
  // for reference, std::mt19937(1337) serializes to 6701 bytes.
13431
14916
  const size_t s_rng_size = sizeof(size_t);
13432
14917
  const size_t s_rng = LLAMA_MAX_RNG_STATE;
14918
+ const size_t s_n_outputs = sizeof(size_t);
14919
+ // assume worst case for outputs although only currently set ones are serialized
14920
+ const size_t s_output_pos = ctx->cparams.n_batch * sizeof(int32_t);
13433
14921
  const size_t s_logits_size = sizeof(size_t);
13434
- // assume worst case for logits although only currently set ones are serialized
13435
- const size_t s_logits = ctx->logits_size * sizeof(float);
14922
+ const size_t s_logits = ctx->logits_size ? cparams.n_batch * hparams.n_vocab * sizeof(float) : 0;
13436
14923
  const size_t s_embedding_size = sizeof(size_t);
13437
- const size_t s_embedding = ctx->embd_size * sizeof(float);
14924
+ const size_t s_embedding = ctx->embd_size ? cparams.n_batch * hparams.n_embd * sizeof(float) : 0;
13438
14925
  const size_t s_kv_buf_size = sizeof(size_t);
13439
14926
  const size_t s_kv_head = sizeof(uint32_t);
13440
14927
  const size_t s_kv_size = sizeof(uint32_t);
13441
14928
  const size_t s_kv_used = sizeof(uint32_t);
13442
14929
  const size_t s_kv = ctx->kv_self.total_size();
13443
- // TODO: assume the max is more than 1 seq_id per KV cell
13444
- const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
14930
+ const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
13445
14931
  const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
13446
14932
 
13447
14933
  const size_t s_total = (
13448
14934
  + s_rng_size
13449
14935
  + s_rng
14936
+ + s_n_outputs
14937
+ + s_output_pos
13450
14938
  + s_logits_size
13451
14939
  + s_logits
13452
14940
  + s_embedding_size
@@ -13521,7 +15009,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13521
15009
  std::ostringstream rng_ss;
13522
15010
  rng_ss << ctx->rng;
13523
15011
 
13524
- const std::string & rng_str = rng_ss.str();
15012
+ const std::string & rng_str = rng_ss.str();
13525
15013
  const size_t rng_size = rng_str.size();
13526
15014
 
13527
15015
  GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
@@ -13530,25 +15018,61 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13530
15018
  data_ctx->write(rng_str.data(), rng_size);
13531
15019
  }
13532
15020
 
13533
- // copy logits
15021
+ // copy outputs
13534
15022
  {
13535
- const size_t logits_size = ctx->logits_size;
15023
+ // Can't use ctx->n_outputs because it's not for the
15024
+ // entire last batch when n_ubatch is smaller than n_batch
15025
+ size_t n_outputs = 0;
13536
15026
 
13537
- data_ctx->write(&logits_size, sizeof(logits_size));
15027
+ // copy output ids
15028
+ {
15029
+ std::vector<int32_t> output_pos;
13538
15030
 
13539
- if (logits_size) {
13540
- data_ctx->write(ctx->logits, logits_size * sizeof(float));
15031
+ const size_t n_batch = ctx->cparams.n_batch;
15032
+ const auto & output_ids = ctx->output_ids;
15033
+
15034
+ output_pos.resize(ctx->output_size);
15035
+
15036
+ // build a more compact representation of the output ids
15037
+ for (size_t i = 0; i < n_batch; ++i) {
15038
+ // map an output id to a position in the batch
15039
+ int32_t pos = output_ids[i];
15040
+ if (pos >= 0) {
15041
+ if ((size_t) pos >= n_outputs) {
15042
+ n_outputs = pos + 1;
15043
+ }
15044
+ GGML_ASSERT((size_t) pos < ctx->output_size);
15045
+ output_pos[pos] = i;
15046
+ }
15047
+ }
15048
+
15049
+ data_ctx->write(&n_outputs, sizeof(n_outputs));
15050
+
15051
+ if (n_outputs) {
15052
+ data_ctx->write(output_pos.data(), n_outputs * sizeof(int32_t));
15053
+ }
13541
15054
  }
13542
- }
13543
15055
 
13544
- // copy embeddings
13545
- {
13546
- const size_t embeddings_size = ctx->embd_size;
15056
+ // copy logits
15057
+ {
15058
+ const size_t logits_size = std::min(ctx->logits_size, n_outputs * ctx->model.hparams.n_vocab);
13547
15059
 
13548
- data_ctx->write(&embeddings_size, sizeof(embeddings_size));
15060
+ data_ctx->write(&logits_size, sizeof(logits_size));
13549
15061
 
13550
- if (embeddings_size) {
13551
- data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
15062
+ if (logits_size) {
15063
+ data_ctx->write(ctx->logits, logits_size * sizeof(float));
15064
+ }
15065
+ }
15066
+
15067
+ // copy embeddings
15068
+ {
15069
+ const size_t embeddings_size = std::min(ctx->embd_size, n_outputs * ctx->model.hparams.n_embd);
15070
+
15071
+ data_ctx->write(&embeddings_size, sizeof(embeddings_size));
15072
+
15073
+ if (embeddings_size) {
15074
+ data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
15075
+ }
13552
15076
  }
13553
15077
  }
13554
15078
 
@@ -13561,9 +15085,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13561
15085
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
13562
15086
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
13563
15087
 
13564
- const size_t kv_buf_size = kv_self.total_size();
15088
+ // NOTE: kv_size and kv_buf_size are mostly used for sanity checks
13565
15089
  const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
13566
15090
  const uint32_t kv_size = kv_self.size;
15091
+ const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
13567
15092
  const uint32_t kv_used = kv_self.used;
13568
15093
 
13569
15094
  data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
@@ -13572,6 +15097,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13572
15097
  data_ctx->write(&kv_used, sizeof(kv_used));
13573
15098
 
13574
15099
  if (kv_buf_size) {
15100
+ const size_t pre_kv_buf_size = data_ctx->get_size_written();
15101
+
13575
15102
  std::vector<uint8_t> tmp_buf;
13576
15103
  for (int il = 0; il < (int) n_layer; ++il) {
13577
15104
  const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
@@ -13601,6 +15128,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
13601
15128
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
13602
15129
  }
13603
15130
  }
15131
+ GGML_ASSERT(kv_buf_size == data_ctx->get_size_written() - pre_kv_buf_size);
13604
15132
  }
13605
15133
 
13606
15134
  for (uint32_t i = 0; i < kv_head; ++i) {
@@ -13645,6 +15173,28 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
  GGML_ASSERT(!rng_ss.fail());
  }
 
+ // set output ids
+ {
+ size_t n_outputs;
+ std::vector<int32_t> output_pos;
+
+ memcpy(&n_outputs, inp, sizeof(n_outputs)); inp += sizeof(n_outputs);
+
+ GGML_ASSERT(n_outputs <= llama_output_reserve(*ctx, n_outputs));
+
+ if (n_outputs) {
+ output_pos.resize(n_outputs);
+ memcpy(output_pos.data(), inp, n_outputs * sizeof(int32_t));
+ inp += n_outputs * sizeof(int32_t);
+
+ for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
+ int32_t id = output_pos[i];
+ GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
+ ctx->output_ids[id] = i;
+ }
+ }
+ }
+
  // set logits
  {
  size_t logits_size;
@@ -13665,7 +15215,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
 
  memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
 
- GGML_ASSERT(ctx->embd_size == embeddings_size);
+ GGML_ASSERT(ctx->embd_size >= embeddings_size);
 
  if (embeddings_size) {
  memcpy(ctx->embd, inp, embeddings_size * sizeof(float));
@@ -13692,8 +15242,18 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
  memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
  memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
 
+ if (kv_self.size != kv_size) {
+ // the KV cache needs to be big enough to load all the KV cells from the saved state
+ GGML_ASSERT(kv_self.size >= kv_head);
+
+ LLAMA_LOG_INFO("%s: state contains %d KV cells, was saved with kv_size=%d, but is loaded with kv_size=%d (fine, but different)\n",
+ __func__, kv_head, kv_size, kv_self.size);
+ }
+
  if (kv_buf_size) {
- GGML_ASSERT(kv_self.total_size() == kv_buf_size);
+ const size_t pre_kv_buf_size = inp - src;
+
+ GGML_ASSERT(kv_self.total_size() >= kv_buf_size);
 
  for (int il = 0; il < (int) n_layer; ++il) {
  const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
@@ -13713,23 +15273,21 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
 
  // v is not contiguous, copy row by row
  const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
- const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_self.size);
 
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
  ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
  inp += v_row_size;
  }
  }
+ GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
  }
 
- GGML_ASSERT(kv_self.size == kv_size);
+ llama_kv_cache_clear(ctx);
 
  ctx->kv_self.head = kv_head;
- ctx->kv_self.size = kv_size;
  ctx->kv_self.used = kv_used;
 
- ctx->kv_self.cells.resize(kv_size);
-
  for (uint32_t i = 0; i < kv_head; ++i) {
  llama_pos pos;
  size_t seq_id_size;
@@ -13746,11 +15304,6 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
  ctx->kv_self.cells[i].seq_id.insert(seq_id);
  }
  }
-
- for (uint32_t i = kv_head; i < kv_size; ++i) {
- ctx->kv_self.cells[i].pos = -1;
- ctx->kv_self.cells[i].seq_id.clear();
- }
  }
 
  const size_t nread = inp - src;
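The load path above now clears the KV cache before restoring and tolerates a context whose kv_size differs from the one that produced the state, as long as it can hold all saved cells. A minimal caller-side sketch of restoring a saved state follows; the helper name load_session and the file handling are illustrative.

// Minimal sketch: load a previously saved state into a (possibly differently sized) context.
#include <cstdio>
#include <vector>
#include "llama.h"

static bool load_session(struct llama_context * ctx, const char * path) {
    FILE * fp = std::fopen(path, "rb");
    if (!fp) {
        return false;
    }
    std::fseek(fp, 0, SEEK_END);
    const long size = std::ftell(fp);
    std::fseek(fp, 0, SEEK_SET);

    std::vector<uint8_t> buf((size_t) size);
    const bool read_ok = std::fread(buf.data(), 1, buf.size(), fp) == buf.size();
    std::fclose(fp);
    if (!read_ok) {
        return false;
    }

    // with 0.14.4 the target context may use a different n_ctx than the one that
    // produced the state, provided it can hold all of the saved KV cells
    llama_set_state_data(ctx, buf.data());
    return true;
}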
@@ -13956,11 +15509,33 @@ float * llama_get_logits(struct llama_context * ctx) {
  }
 
  float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
- assert(ctx->logits_valid.at(i));
-
  llama_synchronize(ctx);
 
- return ctx->logits + i*ctx->model.hparams.n_vocab;
+ try {
+ if (ctx->logits == nullptr) {
+ throw std::runtime_error("no logits");
+ }
+ if ((size_t) i >= ctx->output_ids.size()) {
+ throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+ }
+ const int32_t j = ctx->output_ids[i];
+
+ if (j < 0) {
+ throw std::runtime_error(format("batch.logits[%d] != true", i));
+ }
+ if ((size_t) j >= ctx->output_size) {
+ // This should not happen
+ throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
+ }
+
+ return ctx->logits + j*ctx->model.hparams.n_vocab;
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
+ #ifndef NDEBUG
+ GGML_ASSERT(false);
+ #endif
+ return nullptr;
+ }
  }
 
  float * llama_get_embeddings(struct llama_context * ctx) {
@@ -13972,7 +15547,31 @@ float * llama_get_embeddings(struct llama_context * ctx) {
  float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
  llama_synchronize(ctx);
 
- return ctx->embd + i*ctx->model.hparams.n_embd;
+ try {
+ if (ctx->embd == nullptr) {
+ throw std::runtime_error("no embeddings");
+ }
+ if ((size_t) i >= ctx->output_ids.size()) {
+ throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+ }
+ const int32_t j = ctx->output_ids[i];
+
+ if (j < 0) {
+ throw std::runtime_error(format("batch.logits[%d] != true", i));
+ }
+ if ((size_t) j >= ctx->output_size) {
+ // This should not happen
+ throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
+ }
+
+ return ctx->embd + j*ctx->model.hparams.n_embd;
+ } catch (const std::exception & err) {
+ LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
+ #ifndef NDEBUG
+ GGML_ASSERT(false);
+ #endif
+ return nullptr;
+ }
  }
 
  float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
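With these hunks, llama_get_logits_ith and llama_get_embeddings_ith map the batch index through ctx->output_ids and, in release builds, return nullptr with a log message instead of asserting when the index is invalid. A short caller-side sketch follows; the wrapper name logits_for is illustrative.

// Minimal sketch: callers can now check for nullptr instead of relying on an assert.
#include "llama.h"

static float * logits_for(struct llama_context * ctx, int32_t i_batch) {
    float * logits = llama_get_logits_ith(ctx, i_batch);
    if (logits == nullptr) {
        // batch.logits[i_batch] was not set, or i_batch is out of range
        return nullptr;
    }
    return logits; // points to n_vocab floats for that output position
}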
@@ -14262,6 +15861,55 @@ static int32_t llama_chat_apply_template_internal(
  ss << message->content << "</s>";
  }
  }
+ } else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) {
+ // openchat/openchat-3.5-0106,
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << message->content << "<|end_of_turn|>";
+ } else {
+ role[0] = toupper(role[0]);
+ ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
+ }
+ }
+ if (add_ass) {
+ ss << "GPT4 Correct Assistant:";
+ }
+ } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) {
+ // eachadea/vicuna-13b-1.1 (and Orca variant)
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ // Orca-Vicuna variant uses a system prefix
+ if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) {
+ ss << "SYSTEM: " << message->content << "\n";
+ } else {
+ ss << message->content << "\n\n";
+ }
+ } else if (role == "user") {
+ ss << "USER: " << message->content << "\n";
+ } else if (role == "assistant") {
+ ss << "ASSISTANT: " << message->content << "</s>\n";
+ }
+ }
+ if (add_ass) {
+ ss << "ASSISTANT:";
+ }
+ } else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) {
+ // deepseek-ai/deepseek-coder-33b-instruct
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << message->content;
+ } else if (role == "user") {
+ ss << "### Instruction:\n" << message->content << "\n";
+ } else if (role == "assistant") {
+ ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
+ }
+ }
+ if (add_ass) {
+ ss << "### Response:\n";
+ }
  } else {
  // template not supported
  return -1;
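The hunk above adds built-in formatting for the openchat, vicuna, vicuna-orca, and deepseek chat templates. A minimal sketch of using one of the new template names through the public llama_chat_apply_template API follows; the helper name format_chat_openchat, the buffer size, and the retry-on-truncation pattern are illustrative.

// Minimal sketch: format a conversation with the newly supported "openchat" template.
#include <string>
#include <vector>
#include "llama.h"

static std::string format_chat_openchat(const std::vector<llama_chat_message> & msgs) {
    std::vector<char> buf(4096);
    int32_t n = llama_chat_apply_template(
        /*model   =*/ nullptr,      // nullptr => use the explicit template name below
        /*tmpl    =*/ "openchat",
        msgs.data(), msgs.size(),
        /*add_ass =*/ true,
        buf.data(), (int32_t) buf.size());
    if (n > (int32_t) buf.size()) { // output was truncated, retry with the required size
        buf.resize(n);
        n = llama_chat_apply_template(nullptr, "openchat", msgs.data(), msgs.size(), true,
                                      buf.data(), (int32_t) buf.size());
    }
    return n < 0 ? std::string() : std::string(buf.data(), n);
}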
@@ -14311,6 +15959,30 @@ LLAMA_API int32_t llama_chat_apply_template(
  return res;
  }
 
+ LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
+ static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
+ if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
+ return strlen(split_path);
+ }
+ return 0;
+ }
+
+ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
+ std::string str_split_path(split_path);
+ char postfix[32];
+ snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
+ std::string str_postfix(postfix);
+
+ // check if dest ends with postfix
+ int size_prefix = str_split_path.size() - str_postfix.size();
+ if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
+ snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
+ return size_prefix;
+ }
+
+ return 0;
+ }
+
  struct llama_timings llama_get_timings(struct llama_context * ctx) {
  struct llama_timings result = {
  /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
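The hunk above introduces the split-GGUF helpers: llama_split_path builds "<prefix>-%05d-of-%05d.gguf" shard names and llama_split_prefix recovers the prefix from such a name. A minimal sketch of their use follows; the model prefix and split counts are illustrative values, not taken from the diff.

// Minimal sketch: build and parse split-GGUF shard names with the new helpers.
#include <cstdio>
#include "llama.h"

int main() {
    char path[512];
    // first shard (split_no is 0-based) of a 4-way split
    llama_split_path(path, sizeof(path), "models/grok-1-Q4_K", 0, 4);
    std::printf("%s\n", path);   // models/grok-1-Q4_K-00001-of-00004.gguf

    char prefix[512];
    if (llama_split_prefix(prefix, sizeof(prefix), path, 0, 4) > 0) {
        std::printf("%s\n", prefix); // models/grok-1-Q4_K
    }
    return 0;
}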