llama_cpp 0.13.0 → 0.14.1

@@ -104,6 +104,7 @@
104
104
  #define LLAMA_MAX_NODES 8192
105
105
  #define LLAMA_MAX_EXPERTS 8
106
106
 
107
+
107
108
  //
108
109
  // logging
109
110
  //
@@ -211,10 +212,12 @@ enum llm_arch {
211
212
  LLM_ARCH_INTERNLM2,
212
213
  LLM_ARCH_MINICPM,
213
214
  LLM_ARCH_GEMMA,
215
+ LLM_ARCH_STARCODER2,
216
+ LLM_ARCH_MAMBA,
214
217
  LLM_ARCH_UNKNOWN,
215
218
  };
216
219
 
217
- static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
220
+ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
218
221
  { LLM_ARCH_LLAMA, "llama" },
219
222
  { LLM_ARCH_FALCON, "falcon" },
220
223
  { LLM_ARCH_GPT2, "gpt2" },
@@ -238,6 +241,9 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
238
241
  { LLM_ARCH_INTERNLM2, "internlm2" },
239
242
  { LLM_ARCH_MINICPM, "minicpm" },
240
243
  { LLM_ARCH_GEMMA, "gemma" },
244
+ { LLM_ARCH_STARCODER2, "starcoder2" },
245
+ { LLM_ARCH_MAMBA, "mamba" },
246
+ { LLM_ARCH_UNKNOWN, "(unknown)" },
241
247
  };
242
248
 
243
249
  enum llm_kv {
@@ -252,6 +258,7 @@ enum llm_kv {
252
258
  LLM_KV_GENERAL_SOURCE_URL,
253
259
  LLM_KV_GENERAL_SOURCE_HF_REPO,
254
260
 
261
+ LLM_KV_VOCAB_SIZE,
255
262
  LLM_KV_CONTEXT_LENGTH,
256
263
  LLM_KV_EMBEDDING_LENGTH,
257
264
  LLM_KV_BLOCK_COUNT,
@@ -280,6 +287,11 @@ enum llm_kv {
280
287
  LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
281
288
  LLM_KV_ROPE_SCALING_FINETUNED,
282
289
 
290
+ LLM_KV_SSM_INNER_SIZE,
291
+ LLM_KV_SSM_CONV_KERNEL,
292
+ LLM_KV_SSM_STATE_SIZE,
293
+ LLM_KV_SSM_TIME_STEP_RANK,
294
+
283
295
  LLM_KV_TOKENIZER_MODEL,
284
296
  LLM_KV_TOKENIZER_LIST,
285
297
  LLM_KV_TOKENIZER_TOKEN_TYPE,
@@ -298,7 +310,7 @@ enum llm_kv {
298
310
  LLM_KV_TOKENIZER_RWKV,
299
311
  };
300
312
 
301
- static std::map<llm_kv, const char *> LLM_KV_NAMES = {
313
+ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
302
314
  { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
303
315
  { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
304
316
  { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
@@ -310,6 +322,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
310
322
  { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
311
323
  { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
312
324
 
325
+ { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
313
326
  { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
314
327
  { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
315
328
  { LLM_KV_BLOCK_COUNT, "%s.block_count" },
@@ -338,6 +351,11 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
338
351
  { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
339
352
  { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
340
353
 
354
+ { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
355
+ { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
356
+ { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
357
+ { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
358
+
341
359
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
342
360
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
343
361
  { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
@@ -362,7 +380,7 @@ struct LLM_KV {
362
380
  llm_arch arch;
363
381
 
364
382
  std::string operator()(llm_kv kv) const {
365
- return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
383
+ return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
366
384
  }
367
385
  };
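Note: this hunk makes the lookup tables static const and switches from operator[] to .at(), because std::map::operator[] is non-const (it inserts a default value for missing keys) and cannot be called through a const map, while .at() is const and throws on a missing key. A minimal standalone sketch of the resulting key formatting, with simplified stand-ins for llm_kv, llm_arch and ::format (the real enums and helper live in llama.cpp):

#include <cstdio>
#include <map>

enum fake_arch { ARCH_MAMBA };          // stand-in for llm_arch
enum fake_kv   { KV_SSM_CONV_KERNEL };  // stand-in for llm_kv

static const std::map<fake_arch, const char *> ARCH_NAMES = { { ARCH_MAMBA, "mamba" } };
static const std::map<fake_kv,   const char *> KV_NAMES   = { { KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" } };

int main() {
    // .at() works on a const map and throws std::out_of_range for a missing key;
    // operator[] would not even compile here because it may insert into the map.
    char buf[128];
    std::snprintf(buf, sizeof(buf), KV_NAMES.at(KV_SSM_CONV_KERNEL), ARCH_NAMES.at(ARCH_MAMBA));
    std::printf("%s\n", buf); // prints: mamba.ssm.conv_kernel
    return 0;
}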
368
386
 
@@ -395,9 +413,16 @@ enum llm_tensor {
395
413
  LLM_TENSOR_ATTN_Q_NORM,
396
414
  LLM_TENSOR_ATTN_K_NORM,
397
415
  LLM_TENSOR_LAYER_OUT_NORM,
416
+ LLM_TENSOR_SSM_IN,
417
+ LLM_TENSOR_SSM_CONV1D,
418
+ LLM_TENSOR_SSM_X,
419
+ LLM_TENSOR_SSM_DT,
420
+ LLM_TENSOR_SSM_A,
421
+ LLM_TENSOR_SSM_D,
422
+ LLM_TENSOR_SSM_OUT,
398
423
  };
399
424
 
400
- static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
425
+ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
401
426
  {
402
427
  LLM_ARCH_LLAMA,
403
428
  {
@@ -779,6 +804,40 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
779
804
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
780
805
  },
781
806
  },
807
+ {
808
+ LLM_ARCH_STARCODER2,
809
+ {
810
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
811
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
812
+ { LLM_TENSOR_OUTPUT, "output" },
813
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
814
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
815
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
816
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
817
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
818
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
819
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
820
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
821
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
822
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
823
+ },
824
+ },
825
+ {
826
+ LLM_ARCH_MAMBA,
827
+ {
828
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
829
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
830
+ { LLM_TENSOR_OUTPUT, "output" },
831
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
832
+ { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
833
+ { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
834
+ { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" },
835
+ { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
836
+ { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
837
+ { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
838
+ { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
839
+ },
840
+ },
782
841
  {
783
842
  LLM_ARCH_UNKNOWN,
784
843
  {
@@ -812,38 +871,38 @@ struct LLM_TN {
812
871
  llm_arch arch;
813
872
 
814
873
  std::string operator()(llm_tensor tensor) const {
815
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
874
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
816
875
  return "__missing__";
817
876
  }
818
- return LLM_TENSOR_NAMES[arch].at(tensor);
877
+ return LLM_TENSOR_NAMES.at(arch).at(tensor);
819
878
  }
820
879
 
821
880
  std::string operator()(llm_tensor tensor, const std::string & suffix) const {
822
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
881
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
823
882
  return "__missing__";
824
883
  }
825
- return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
884
+ return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix;
826
885
  }
827
886
 
828
887
  std::string operator()(llm_tensor tensor, int bid) const {
829
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
888
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
830
889
  return "__missing__";
831
890
  }
832
- return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
891
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid);
833
892
  }
834
893
 
835
894
  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
836
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
895
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
837
896
  return "__missing__";
838
897
  }
839
- return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
898
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix;
840
899
  }
841
900
 
842
901
  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
843
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
902
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
844
903
  return "__missing__";
845
904
  }
846
- return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
905
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
847
906
  }
848
907
  };
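Note: the LLM_TN operators above build per-layer tensor names by formatting the "blk.%d...." templates with the layer index and appending a suffix, falling back to "__missing__" when an architecture has no entry for a tensor. A minimal sketch of that behavior with a hypothetical fake_tensor enum standing in for llm_tensor:

#include <cstdio>
#include <map>
#include <string>

enum fake_tensor { T_SSM_CONV1D, T_FFN_GATE }; // stand-ins for llm_tensor values

static const std::map<fake_tensor, std::string> NAMES = {
    { T_SSM_CONV1D, "blk.%d.ssm_conv1d" },
};

// minimal stand-in for LLM_TN::operator()(tensor, suffix, bid)
static std::string tensor_name(fake_tensor t, const std::string & suffix, int bid) {
    if (NAMES.find(t) == NAMES.end()) {
        return "__missing__"; // same fallback the real LLM_TN uses
    }
    char buf[128];
    std::snprintf(buf, sizeof(buf), NAMES.at(t).c_str(), bid);
    return std::string(buf) + "." + suffix;
}

int main() {
    std::printf("%s\n", tensor_name(T_SSM_CONV1D, "weight", 5).c_str()); // blk.5.ssm_conv1d.weight
    std::printf("%s\n", tensor_name(T_FFN_GATE,   "weight", 5).c_str()); // __missing__
    return 0;
}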
849
908
 
@@ -851,16 +910,16 @@ struct LLM_TN {
851
910
  // gguf helpers
852
911
  //
853
912
 
854
- static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
913
+ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
855
914
  { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
856
915
  { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
857
916
  { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
858
917
  };
859
918
 
860
- static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
919
+ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
861
920
  for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
862
921
  if (kv.second == name) {
863
- return kv.first;
922
+ return (llama_rope_scaling_type) kv.first;
864
923
  }
865
924
  }
866
925
 
@@ -921,21 +980,6 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
921
980
  }
922
981
  }
923
982
 
924
- //
925
- // ggml helpers
926
- //
927
-
928
- static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
929
- struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
930
-
931
- if (plan.work_size > 0) {
932
- buf.resize(plan.work_size);
933
- plan.work_data = buf.data();
934
- }
935
-
936
- ggml_graph_compute(graph, &plan);
937
- }
938
-
939
983
  //
940
984
  // llama helpers
941
985
  //
@@ -1409,7 +1453,9 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
1409
1453
  buft = ggml_backend_cuda_host_buffer_type();
1410
1454
  }
1411
1455
  #elif defined(GGML_USE_SYCL)
1412
- buft = ggml_backend_sycl_host_buffer_type();
1456
+ if (host_buffer) {
1457
+ buft = ggml_backend_sycl_host_buffer_type();
1458
+ }
1413
1459
  #elif defined(GGML_USE_CPU_HBM)
1414
1460
  buft = ggml_backend_cpu_hbm_buffer_type();
1415
1461
  #elif defined(GGML_USE_VULKAN)
@@ -1463,6 +1509,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
1463
1509
  }
1464
1510
  #endif
1465
1511
 
1512
+ #ifdef GGML_USE_SYCL
1513
+ if (ggml_backend_sycl_get_device_count() > 1) {
1514
+ buft = ggml_backend_sycl_split_buffer_type(tensor_split);
1515
+ }
1516
+ #endif
1517
+
1466
1518
  if (buft == nullptr) {
1467
1519
  buft = llama_default_buffer_type_offload(fallback_gpu);
1468
1520
  }
@@ -1474,6 +1526,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
1474
1526
  static size_t llama_get_device_count() {
1475
1527
  #if defined(GGML_USE_CUBLAS)
1476
1528
  return ggml_backend_cuda_get_device_count();
1529
+ #elif defined(GGML_USE_SYCL)
1530
+ return ggml_backend_sycl_get_device_count();
1477
1531
  #elif defined(GGML_USE_VULKAN)
1478
1532
  return ggml_backend_vk_get_device_count();
1479
1533
  #else
@@ -1487,6 +1541,11 @@ static size_t llama_get_device_memory(int device) {
1487
1541
  size_t free;
1488
1542
  ggml_backend_cuda_get_device_memory(device, &total, &free);
1489
1543
  return free;
1544
+ #elif defined(GGML_USE_SYCL)
1545
+ size_t total;
1546
+ size_t free;
1547
+ ggml_backend_sycl_get_device_memory(device, &total, &free);
1548
+ return free;
1490
1549
  #elif defined(GGML_USE_VULKAN)
1491
1550
  size_t total;
1492
1551
  size_t free;
@@ -1575,7 +1634,12 @@ struct llama_hparams {
1575
1634
  float rope_freq_base_train;
1576
1635
  float rope_freq_scale_train;
1577
1636
  uint32_t n_yarn_orig_ctx;
1578
- int32_t rope_scaling_type_train;
1637
+
1638
+ // for State Space Models
1639
+ uint32_t ssm_d_conv = 0;
1640
+ uint32_t ssm_d_inner = 0;
1641
+ uint32_t ssm_d_state = 0;
1642
+ uint32_t ssm_dt_rank = 0;
1579
1643
 
1580
1644
  float f_clamp_kqv = 0.0f;
1581
1645
  float f_max_alibi_bias = 0.0f;
@@ -1583,8 +1647,9 @@ struct llama_hparams {
1583
1647
  bool causal_attn = true;
1584
1648
  bool need_kq_pos = false;
1585
1649
 
1586
- enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
1587
- enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
1650
+ enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
1651
+ enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
1652
+ enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
1588
1653
 
1589
1654
  bool operator!=(const llama_hparams & other) const {
1590
1655
  if (this->vocab_only != other.vocab_only) return true;
@@ -1604,6 +1669,11 @@ struct llama_hparams {
1604
1669
  if (this->rope_finetuned != other.rope_finetuned) return true;
1605
1670
  if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
1606
1671
 
1672
+ if (this->ssm_d_conv != other.ssm_d_conv) return true;
1673
+ if (this->ssm_d_inner != other.ssm_d_inner) return true;
1674
+ if (this->ssm_d_state != other.ssm_d_state) return true;
1675
+ if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
1676
+
1607
1677
  const float EPSILON = 1e-9f;
1608
1678
 
1609
1679
  if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
@@ -1615,6 +1685,9 @@ struct llama_hparams {
1615
1685
  }
1616
1686
 
1617
1687
  uint32_t n_gqa() const {
1688
+ if (n_head_kv == 0) {
1689
+ return 0;
1690
+ }
1618
1691
  return n_head/n_head_kv;
1619
1692
  }
1620
1693
 
@@ -1625,16 +1698,29 @@ struct llama_hparams {
1625
1698
  uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads
1626
1699
  return n_embd_head_v * n_head_kv;
1627
1700
  }
1701
+
1702
+ uint32_t n_embd_k_s() const { // dimension of the rolling state embeddings
1703
+ // corresponds to Mamba's conv_states size
1704
+ // TODO: maybe support other convolution strides than 1
1705
+ // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
1706
+ return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
1707
+ }
1708
+
1709
+ uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
1710
+ // corresponds to Mamba's ssm_states size
1711
+ return ssm_d_state * ssm_d_inner;
1712
+ }
1628
1713
  };
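Note: for recurrent (Mamba) models the per-cell "K" and "V" slots store convolution and SSM states instead of attention keys/values, and the two new methods give their per-cell sizes. A worked example using illustrative hyperparameters consistent with the smallest Mamba configuration elsewhere in this diff (n_embd = 768 with the 2*n_embd == d_inner expansion); the concrete numbers are assumptions, not read from a model file:

#include <cstdint>
#include <cstdio>

int main() {
    // Illustrative Mamba-style hyperparameters (assumed values):
    const uint32_t ssm_d_conv  = 4;     // conv kernel size
    const uint32_t ssm_d_inner = 1536;  // inner (expanded) dimension, 2 * n_embd
    const uint32_t ssm_d_state = 16;    // SSM state size

    // mirrors llama_hparams::n_embd_k_s(): rolling conv state, first column not stored
    const uint32_t n_embd_k_s = (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner; // 3*1536 = 4608
    // mirrors llama_hparams::n_embd_v_s(): recurrent SSM state
    const uint32_t n_embd_v_s = ssm_d_state * ssm_d_inner;                           // 16*1536 = 24576

    std::printf("per-cell conv state: %u floats, ssm state: %u floats\n", n_embd_k_s, n_embd_v_s);
    return 0;
}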
1629
1714
 
1630
1715
  struct llama_cparams {
1631
- uint32_t n_ctx; // context size used during inference
1716
+ uint32_t n_ctx; // context size used during inference
1632
1717
  uint32_t n_batch;
1718
+ uint32_t n_ubatch;
1633
1719
  uint32_t n_threads; // number of threads to use for generation
1634
1720
  uint32_t n_threads_batch; // number of threads to use for batch processing
1635
1721
 
1636
- float rope_freq_base;
1637
- float rope_freq_scale;
1722
+ float rope_freq_base;
1723
+ float rope_freq_scale;
1638
1724
 
1639
1725
  uint32_t n_yarn_orig_ctx;
1640
1726
  // These hyperparameters are not exposed in GGUF, because all
@@ -1645,8 +1731,11 @@ struct llama_cparams {
1645
1731
  float yarn_beta_slow;
1646
1732
  float defrag_thold;
1647
1733
 
1734
+ bool embeddings;
1735
+ bool causal_attn;
1648
1736
  bool offload_kqv;
1649
- bool do_pooling;
1737
+
1738
+ enum llama_pooling_type pooling_type;
1650
1739
 
1651
1740
  ggml_backend_sched_eval_callback cb_eval;
1652
1741
  void * cb_eval_user_data;
@@ -1700,11 +1789,27 @@ struct llama_layer {
1700
1789
  struct ggml_tensor * ffn_down_b; // b2
1701
1790
  struct ggml_tensor * ffn_up_b; // b3
1702
1791
  struct ggml_tensor * ffn_act;
1792
+
1793
+ // mamba proj
1794
+ struct ggml_tensor * ssm_in;
1795
+ struct ggml_tensor * ssm_x;
1796
+ struct ggml_tensor * ssm_dt;
1797
+ struct ggml_tensor * ssm_out;
1798
+
1799
+ // mamba
1800
+ struct ggml_tensor * ssm_conv1d;
1801
+ struct ggml_tensor * ssm_a;
1802
+ struct ggml_tensor * ssm_d;
1803
+
1804
+ // mamba bias
1805
+ struct ggml_tensor * ssm_conv1d_b;
1806
+ struct ggml_tensor * ssm_dt_b;
1703
1807
  };
1704
1808
 
1705
1809
  struct llama_kv_cell {
1706
1810
  llama_pos pos = -1;
1707
1811
  llama_pos delta = 0;
1812
+ int32_t src = 0; // used by recurrent state models to copy states
1708
1813
 
1709
1814
  std::set<llama_seq_id> seq_id;
1710
1815
 
@@ -1725,6 +1830,9 @@ struct llama_kv_cell {
1725
1830
  struct llama_kv_cache {
1726
1831
  bool has_shift = false;
1727
1832
  bool do_defrag = false;
1833
+ bool do_copy = false;
1834
+ // with recurrent state models, a cell can hold the state for more than one past token
1835
+ bool recurrent = false;
1728
1836
 
1729
1837
  // Note: The value of head isn't only used to optimize searching
1730
1838
  // for a free KV slot. llama_decode_internal also uses it, so it
@@ -1904,8 +2012,7 @@ struct llama_context {
1904
2012
  ggml_vk_free_cpu_assist();
1905
2013
  #endif
1906
2014
 
1907
- ggml_backend_buffer_free(buf_input);
1908
- ggml_free(ctx_input);
2015
+ ggml_backend_buffer_free(buf_output);
1909
2016
  }
1910
2017
 
1911
2018
  llama_cparams cparams;
@@ -1931,36 +2038,54 @@ struct llama_context {
1931
2038
  int64_t t_p_eval_us = 0;
1932
2039
  int64_t t_eval_us = 0;
1933
2040
 
2041
+ int64_t t_compute_start_us = 0;
2042
+ int64_t n_queued_tokens = 0;
2043
+
1934
2044
  int32_t n_sample = 0; // number of tokens sampled
1935
2045
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
1936
2046
  int32_t n_eval = 0; // number of eval calls
1937
2047
 
2048
+ // host buffer for the model output (logits and embeddings)
2049
+ ggml_backend_buffer_t buf_output = nullptr;
2050
+
1938
2051
  // decode output (2-dimensional array: [n_tokens][n_vocab])
1939
- std::vector<float> logits;
2052
+ size_t logits_size = 0;
2053
+ float * logits = nullptr;
2054
+
1940
2055
  #ifndef NDEBUG
1941
2056
  // guard against access to unset logits
1942
2057
  std::vector<bool> logits_valid;
1943
2058
  #endif
1944
2059
  bool logits_all = false;
1945
2060
 
1946
- // input embedding (1-dimensional array: [n_embd])
1947
- std::vector<float> embedding;
2061
+ // embeddings output (2-dimensional array: [n_tokens][n_embd])
2062
+ // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
2063
+ size_t embd_size = 0;
2064
+ float * embd = nullptr;
2065
+
2066
+ // sequence embeddings output (map of [n_embd] vectors)
2067
+ // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
2068
+ std::map<llama_seq_id, std::vector<float>> embd_seq;
1948
2069
 
1949
2070
  // memory buffers used to evaluate the model
1950
2071
  std::vector<uint8_t> buf_compute_meta;
1951
2072
  ggml_backend_sched_t sched = nullptr;
1952
2073
 
2074
+ ggml_abort_callback abort_callback = nullptr;
2075
+ void * abort_callback_data = nullptr;
2076
+
1953
2077
  // input tensors
1954
- ggml_backend_buffer_t buf_input = nullptr;
1955
- ggml_context * ctx_input = nullptr;
1956
2078
  struct ggml_tensor * inp_tokens; // I32 [n_batch]
1957
2079
  struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
1958
2080
  struct ggml_tensor * inp_pos; // I32 [n_batch]
1959
- struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
1960
- struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx]
1961
- struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
2081
+ struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
2082
+ struct ggml_tensor * inp_KQ_pos; // F32 [kv_size]
2083
+ struct ggml_tensor * inp_K_shift; // I32 [kv_size]
1962
2084
  struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
1963
2085
  struct ggml_tensor * inp_cls; // I32 [n_batch]
2086
+ struct ggml_tensor * inp_s_copy; // I32 [kv_size]
2087
+ struct ggml_tensor * inp_s_mask; // F32 [1, kv_size]
2088
+ struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch]
1964
2089
 
1965
2090
  #ifdef GGML_USE_MPI
1966
2091
  ggml_mpi_context * ctx_mpi = NULL;
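Note: llama_context now keeps the decode outputs in a single backend host buffer (buf_output) and exposes them as raw float pointers plus sizes instead of std::vectors, so reading outputs means indexing flat arrays. A minimal sketch of that indexing, where ctx_logits/ctx_embd are hypothetical stand-ins for the context's logits and embd pointers and the layouts follow the comments above ([n_tokens][n_vocab] and [n_tokens][n_embd]):

#include <cstddef>
#include <vector>

// Row i of the flat logits buffer holds the logits for the i-th token of the batch.
float * logits_for_token(float * ctx_logits, size_t n_vocab, size_t i_token) {
    return ctx_logits + i_token * n_vocab;
}

// Same idea for the embeddings buffer when pooling_type == LLAMA_POOLING_TYPE_NONE.
float * embd_for_token(float * ctx_embd, size_t n_embd, size_t i_token) {
    return ctx_embd + i_token * n_embd;
}

int main() {
    const size_t n_vocab = 8, n_tokens = 3;
    std::vector<float> fake_logits(n_vocab * n_tokens, 0.0f); // placeholder buffer
    fake_logits[2 * n_vocab + 5] = 1.0f;                      // token 2, vocab id 5
    return logits_for_token(fake_logits.data(), n_vocab, 2)[5] == 1.0f ? 0 : 1;
}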
@@ -1976,25 +2101,42 @@ static bool llama_kv_cache_init(
1976
2101
  const llama_model & model,
1977
2102
  ggml_type type_k,
1978
2103
  ggml_type type_v,
1979
- uint32_t n_ctx,
2104
+ uint32_t kv_size,
1980
2105
  bool offload) {
1981
2106
  const struct llama_hparams & hparams = model.hparams;
1982
2107
 
1983
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
1984
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
2108
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
2109
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
1985
2110
  const int64_t n_layer = hparams.n_layer;
1986
2111
 
1987
2112
  cache.has_shift = false;
1988
2113
 
2114
+ // TODO: find a nicer way to add other recurrent model architectures
2115
+ cache.recurrent = model.arch == LLM_ARCH_MAMBA;
2116
+
2117
+ // TODO: support mixed recurrent Transformer architectures

2118
+ // NOTE: (!a || b) is a logical implication (a -> b)
2119
+ GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
2120
+ GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());
2121
+ GGML_ASSERT( cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_gqa());
2122
+ GGML_ASSERT( cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_gqa());
2123
+
1989
2124
  cache.head = 0;
1990
- cache.size = n_ctx;
2125
+ cache.size = kv_size;
1991
2126
  cache.used = 0;
1992
2127
 
1993
2128
  cache.type_k = type_k;
1994
2129
  cache.type_v = type_v;
1995
2130
 
1996
2131
  cache.cells.clear();
1997
- cache.cells.resize(n_ctx);
2132
+ cache.cells.resize(kv_size);
2133
+
2134
+ if (cache.recurrent) {
2135
+ // init state copy sources
2136
+ for (uint32_t i = 0; i < cache.size; ++i) {
2137
+ cache.cells[i].src = i;
2138
+ }
2139
+ }
1998
2140
 
1999
2141
  #ifdef GGML_USE_CLBLAST
2000
2142
  offload = false;
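Note: in llama_kv_cache_init the per-layer cache tensors are sized with n_embd_k_gqa() + n_embd_k_s() (and the V analogue), so for an attention model only the GQA part is non-zero and for a pure recurrent model only the state part is; the asserts written as !cache.recurrent || ... encode exactly that implication. A small sketch of the resulting per-layer element counts, reusing the illustrative Mamba state sizes from the hparams example and an assumed kv_size of one cell per sequence (all numbers are assumptions):

#include <cstdint>
#include <cstdio>

int main() {
    // Illustrative values (assumptions, not read from the diff):
    const uint32_t kv_size    = 8;          // e.g. one cell per sequence with --parallel 8
    const uint32_t n_embd_k_s = 3 * 1536;   // per-cell conv state
    const uint32_t n_embd_v_s = 16 * 1536;  // per-cell ssm state

    // For a pure recurrent model the attention parts n_embd_k_gqa()/n_embd_v_gqa() are 0,
    // which is what the GGML_ASSERT implications above enforce.
    const uint32_t n_embd_k_gqa_attn = 0;
    const uint32_t n_embd_v_gqa_attn = 0;

    // sizes passed to ggml_new_tensor_1d for each layer's cache tensors:
    const uint32_t k_elems = (n_embd_k_gqa_attn + n_embd_k_s) * kv_size; //  36864 elements
    const uint32_t v_elems = (n_embd_v_gqa_attn + n_embd_v_s) * kv_size; // 196608 elements

    std::printf("per layer: k = %u elements, v = %u elements\n", k_elems, v_elems);
    return 0;
}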
@@ -2033,8 +2175,8 @@ static bool llama_kv_cache_init(
2033
2175
 
2034
2176
  for (int i = 0; i < (int) n_layer; i++) {
2035
2177
  struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
2036
- ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*n_ctx);
2037
- ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*n_ctx);
2178
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
2179
+ ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
2038
2180
  ggml_format_name(k, "cache_k_l%d", i);
2039
2181
  ggml_format_name(v, "cache_v_l%d", i);
2040
2182
  cache.k_l.push_back(k);
@@ -2068,6 +2210,54 @@ static bool llama_kv_cache_find_slot(
2068
2210
  const uint32_t n_ctx = cache.size;
2069
2211
  const uint32_t n_tokens = batch.n_tokens;
2070
2212
 
2213
+ if (cache.recurrent) {
2214
+ // For recurrent state architectures (like Mamba),
2215
+ // each KV cache cell can store the state for a whole sequence.
2216
+
2217
+ llama_seq_id min = cache.size - 1;
2218
+ llama_seq_id max = 0;
2219
+
2220
+ for (uint32_t i = 0; i < n_tokens; ++i) {
2221
+ for (int32_t j = 0; j < batch.n_seq_id[i]; ++j) {
2222
+ llama_seq_id seq_id = batch.seq_id[i][j];
2223
+ // make sure it's a valid seq_id
2224
+ if ((uint32_t) seq_id < cache.size) {
2225
+ if (seq_id > max) {
2226
+ max = seq_id;
2227
+ }
2228
+ if (seq_id < min) {
2229
+ min = seq_id;
2230
+ }
2231
+ // Assuming the tokens are in-order
2232
+ if (batch.pos[i] != cache.cells[seq_id].pos + 1) {
2233
+ // What should happen when the pos backtracks or skips a value?
2234
+ // Clearing the state mid-batch would require special-casing which isn't done.
2235
+ LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d\n",
2236
+ __func__, batch.pos[i], cache.cells[seq_id].pos, seq_id);
2237
+ }
2238
+ if (cache.cells[seq_id].pos < 0 && 0 <= batch.pos[i]) {
2239
+ cache.used += 1;
2240
+ }
2241
+ cache.cells[seq_id].pos = batch.pos[i];
2242
+ // NOTE: seq_ids are not inserted here; they are handled when the input tensors are set
2243
+ } else {
2244
+ // too big seq_id
2245
+ // TODO: would it be possible to resize the KV cache size instead?
2246
+ LLAMA_LOG_ERROR("%s: seq_id=%d >= kv_size=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size);
2247
+ return false;
2248
+ }
2249
+ }
2250
+ }
2251
+
2252
+ // allow getting the range of used cells, from head to head + n
2253
+ cache.head = min;
2254
+ cache.n = max - min + 1;
2255
+
2256
+ // sanity check
2257
+ return max >= min;
2258
+ }
2259
+ // otherwise, one cell per token.
2260
+
2071
2261
  if (n_tokens > n_ctx) {
2072
2262
  LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
2073
2263
  return false;
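Note: in the recurrent branch above the cell index is the seq_id itself, and head/n are set to the min/max over the batch so later code can address a contiguous range of cells. A small sketch of that bookkeeping for a hypothetical batch touching sequences 1 and 3 (cache size and seq_ids are illustrative):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const uint32_t kv_size = 8;                     // assumed cache size (one cell per sequence)
    std::vector<int32_t> batch_seq_ids = {1, 3, 3}; // seq_id of each token in the batch

    int32_t min_id = kv_size - 1;
    int32_t max_id = 0;
    for (int32_t s : batch_seq_ids) {
        min_id = std::min(min_id, s);
        max_id = std::max(max_id, s);
    }

    // mirrors the recurrent branch of llama_kv_cache_find_slot:
    const uint32_t head = min_id;              // first used cell
    const uint32_t n    = max_id - min_id + 1; // 3 cells (1..3), even though cell 2 is untouched

    std::printf("head = %u, n = %u\n", head, n); // head = 1, n = 3
    return 0;
}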
@@ -2116,10 +2306,12 @@ static bool llama_kv_cache_find_slot(
2116
2306
  }
2117
2307
 
2118
2308
  // find how many cells are currently in use
2119
- static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
2120
- for (uint32_t i = cache.size - 1; i > 0; --i) {
2121
- if (cache.cells[i].pos >= 0 && !cache.cells[i].is_empty()) {
2122
- return i + 1;
2309
+ static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
2310
+ for (uint32_t i = cache.size; i > 0; --i) {
2311
+ const llama_kv_cell & cell = cache.cells[i - 1];
2312
+
2313
+ if (cell.pos >= 0 && !cell.is_empty()) {
2314
+ return i;
2123
2315
  }
2124
2316
  }
2125
2317
 
@@ -2135,7 +2327,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
2135
2327
  cache.used = 0;
2136
2328
  }
2137
2329
 
2138
- static void llama_kv_cache_seq_rm(
2330
+ static bool llama_kv_cache_seq_rm(
2139
2331
  struct llama_kv_cache & cache,
2140
2332
  llama_seq_id seq_id,
2141
2333
  llama_pos p0,
@@ -2145,6 +2337,25 @@ static void llama_kv_cache_seq_rm(
2145
2337
  if (p0 < 0) p0 = 0;
2146
2338
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
2147
2339
 
2340
+ // models like Mamba can't have a state partially erased
2341
+ if (cache.recurrent) {
2342
+ if (seq_id >= (int64_t) cache.size) {
2343
+ // could be fatal
2344
+ return false;
2345
+ }
2346
+ if (0 <= seq_id) {
2347
+ // partial intersection is invalid
2348
+ if ((0 < p0 && p0 <= cache.cells[seq_id].pos) || (0 < p1 && p1 <= cache.cells[seq_id].pos)) {
2349
+ return false;
2350
+ }
2351
+ } else {
2352
+ // if seq_id is negative, the range should include everything or nothing
2353
+ if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
2354
+ return false;
2355
+ }
2356
+ }
2357
+ }
2358
+
2148
2359
  for (uint32_t i = 0; i < cache.size; ++i) {
2149
2360
  if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
2150
2361
  if (seq_id < 0) {
@@ -2166,6 +2377,8 @@ static void llama_kv_cache_seq_rm(
2166
2377
 
2167
2378
  // If we freed up a slot, set head to it so searching can start there.
2168
2379
  if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
2380
+
2381
+ return true;
2169
2382
  }
2170
2383
 
2171
2384
  static void llama_kv_cache_seq_cp(
@@ -2177,6 +2390,29 @@ static void llama_kv_cache_seq_cp(
2177
2390
  if (p0 < 0) p0 = 0;
2178
2391
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
2179
2392
 
2393
+ if (cache.recurrent) {
2394
+ if ((uint32_t) seq_id_dst < cache.size && (uint32_t) seq_id_src < cache.size) {
2395
+ seq_id_src = cache.cells[seq_id_src].src;
2396
+ GGML_ASSERT((uint32_t) seq_id_src < cache.size);
2397
+ // intent to "copy from"
2398
+ // supports copy chains thanks to taking the source of the source
2399
+ cache.cells[seq_id_dst].src = seq_id_src;
2400
+
2401
+ // preserve the "keep or clear" status of the copied sequence
2402
+ if (cache.cells[seq_id_src].has_seq_id(seq_id_src)) {
2403
+ cache.cells[seq_id_dst].seq_id.insert(seq_id_dst);
2404
+ } else {
2405
+ cache.cells[seq_id_dst].seq_id.erase(seq_id_dst);
2406
+ }
2407
+
2408
+ cache.do_copy = true;
2409
+
2410
+ cache.cells[seq_id_dst].pos = cache.cells[seq_id_src].pos;
2411
+ }
2412
+ return;
2413
+ }
2414
+ // otherwise, this is the KV cache of a Transformer-like model
2415
+
2180
2416
  cache.head = 0;
2181
2417
 
2182
2418
  for (uint32_t i = 0; i < cache.size; ++i) {
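Note: in the recurrent branch of llama_kv_cache_seq_cp a copy only records an intent: the destination cell's src is set to the resolved source cell, and the actual state copy happens later in the build_s_copy graph. Because the code first follows cache.cells[seq_id_src].src, chained copies collapse to the original source. A minimal sketch of that resolution, with cells reduced to just the src field:

#include <cstdint>
#include <cstdio>
#include <vector>

struct cell { int32_t src; };

// mirrors the "copy from" bookkeeping of the recurrent branch
static void seq_cp(std::vector<cell> & cells, int32_t dst, int32_t src) {
    src = cells[src].src;  // follow the chain to the real source
    cells[dst].src = src;
}

int main() {
    std::vector<cell> cells(4);
    for (int32_t i = 0; i < (int32_t) cells.size(); ++i) cells[i].src = i; // init, as in kv_cache_init

    seq_cp(cells, /*dst=*/1, /*src=*/0); // 1 copies from 0
    seq_cp(cells, /*dst=*/2, /*src=*/1); // 2 copies from 1, which resolves to 0

    std::printf("cells[2].src = %d\n", cells[2].src); // 0, not 1
    return 0;
}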
@@ -2216,6 +2452,17 @@ static void llama_kv_cache_seq_add(
2216
2452
  if (p0 < 0) p0 = 0;
2217
2453
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
2218
2454
 
2455
+ if (cache.recurrent) {
2456
+ // for Mamba-like models, only the pos needs to be shifted
2457
+ if (0 <= seq_id && seq_id < (int64_t) cache.size) {
2458
+ llama_kv_cell & cell = cache.cells[seq_id];
2459
+ if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
2460
+ cell.pos += delta;
2461
+ }
2462
+ }
2463
+ return;
2464
+ }
2465
+
2219
2466
  for (uint32_t i = 0; i < cache.size; ++i) {
2220
2467
  if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
2221
2468
  cache.has_shift = true;
@@ -2249,6 +2496,17 @@ static void llama_kv_cache_seq_div(
2249
2496
  if (p0 < 0) p0 = 0;
2250
2497
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
2251
2498
 
2499
+ if (cache.recurrent) {
2500
+ // for Mamba-like models, only the pos needs to be changed
2501
+ if (0 <= seq_id && seq_id < (int64_t) cache.size) {
2502
+ llama_kv_cell & cell = cache.cells[seq_id];
2503
+ if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
2504
+ cell.pos /= d;
2505
+ }
2506
+ }
2507
+ return;
2508
+ }
2509
+
2252
2510
  for (uint32_t i = 0; i < cache.size; ++i) {
2253
2511
  if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
2254
2512
  cache.has_shift = true;
@@ -2891,7 +3149,11 @@ template<>
2891
3149
  bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
2892
3150
  uint32_t tmp;
2893
3151
  const bool found = get_key(kid, tmp, required);
2894
- result = (enum llama_pooling_type) tmp;
3152
+ if (found) {
3153
+ result = (enum llama_pooling_type) tmp;
3154
+ } else {
3155
+ result = LLAMA_POOLING_TYPE_UNSPECIFIED;
3156
+ }
2895
3157
  return found;
2896
3158
  }
2897
3159
 
@@ -2982,10 +3244,11 @@ static const char * llama_model_type_name(e_model type) {
2982
3244
 
2983
3245
  static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
2984
3246
  switch (type) {
2985
- case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2986
- case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2987
- case LLAMA_VOCAB_TYPE_WPM: return "WPM";
2988
- default: return "unknown";
3247
+ case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
3248
+ case LLAMA_VOCAB_TYPE_SPM: return "SPM";
3249
+ case LLAMA_VOCAB_TYPE_BPE: return "BPE";
3250
+ case LLAMA_VOCAB_TYPE_WPM: return "WPM";
3251
+ default: return "unknown";
2989
3252
  }
2990
3253
  }
2991
3254
 
@@ -3017,14 +3280,14 @@ static void llm_load_hparams(
3017
3280
  ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
3018
3281
 
3019
3282
  // get hparams kv
3020
- ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
3021
- ml.get_key (LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
3022
- ml.get_key (LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
3023
- ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
3024
- ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
3025
- ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer);
3026
- ml.get_key (LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
3027
- ml.get_key (LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
3283
+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
3284
+ ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
3285
+ ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
3286
+ ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
3287
+ ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
3288
+ ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
3289
+ ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
3290
+ ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
3028
3291
 
3029
3292
  GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
3030
3293
  GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
@@ -3064,7 +3327,7 @@ static void llm_load_hparams(
3064
3327
 
3065
3328
  // sanity check for n_rot (optional)
3066
3329
  {
3067
- hparams.n_rot = hparams.n_embd / hparams.n_head;
3330
+ hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
3068
3331
 
3069
3332
  ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
3070
3333
 
@@ -3077,10 +3340,10 @@ static void llm_load_hparams(
3077
3340
  // gpt-j n_rot = rotary_dim
3078
3341
  }
3079
3342
 
3080
- hparams.n_embd_head_k = hparams.n_embd / hparams.n_head;
3343
+ hparams.n_embd_head_k = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
3081
3344
  ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
3082
3345
 
3083
- hparams.n_embd_head_v = hparams.n_embd / hparams.n_head;
3346
+ hparams.n_embd_head_v = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
3084
3347
  ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
3085
3348
 
3086
3349
  // arch-specific KVs
@@ -3168,7 +3431,7 @@ static void llm_load_hparams(
3168
3431
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3169
3432
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3170
3433
  ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
3171
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3434
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
3172
3435
 
3173
3436
  switch (hparams.n_layer) {
3174
3437
  case 3:
@@ -3320,6 +3583,46 @@ static void llm_load_hparams(
3320
3583
  default: model.type = e_model::MODEL_UNKNOWN;
3321
3584
  }
3322
3585
  } break;
3586
+ case LLM_ARCH_STARCODER2:
3587
+ {
3588
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3589
+ switch (hparams.n_layer) {
3590
+ case 30: model.type = e_model::MODEL_3B; break;
3591
+ case 32: model.type = e_model::MODEL_7B; break;
3592
+ case 40: model.type = e_model::MODEL_15B; break;
3593
+ default: model.type = e_model::MODEL_UNKNOWN;
3594
+ }
3595
+ } break;
3596
+ case LLM_ARCH_MAMBA:
3597
+ {
3598
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
3599
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
3600
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
3601
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
3602
+
3603
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
3604
+
3605
+ switch (hparams.n_layer) {
3606
+ case 24:
3607
+ switch (hparams.n_embd) {
3608
+ case 768: model.type = e_model::MODEL_SMALL; break;
3609
+ default: model.type = e_model::MODEL_UNKNOWN;
3610
+ } break;
3611
+ case 48:
3612
+ switch (hparams.n_embd) {
3613
+ case 1024: model.type = e_model::MODEL_MEDIUM; break;
3614
+ case 1536: model.type = e_model::MODEL_LARGE; break;
3615
+ case 2048: model.type = e_model::MODEL_XL; break;
3616
+ default: model.type = e_model::MODEL_UNKNOWN;
3617
+ } break;
3618
+ case 64:
3619
+ switch (hparams.n_embd) {
3620
+ case 2560: model.type = e_model::MODEL_3B; break;
3621
+ default: model.type = e_model::MODEL_UNKNOWN;
3622
+ } break;
3623
+ default: model.type = e_model::MODEL_UNKNOWN;
3624
+ }
3625
+ } break;
3323
3626
  default: (void)0;
3324
3627
  }
3325
3628
 
@@ -3345,30 +3648,25 @@ static void llm_load_vocab(
3345
3648
 
3346
3649
  const auto kv = LLM_KV(model.arch);
3347
3650
 
3348
- const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
3349
- if (token_idx == -1) {
3350
- throw std::runtime_error("cannot find tokenizer vocab in model file\n");
3351
- }
3352
-
3353
- const float * scores = nullptr;
3354
- const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
3355
- if (score_idx != -1) {
3356
- scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
3357
- }
3358
-
3359
- const int * toktypes = nullptr;
3360
- const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
3361
- if (toktype_idx != -1) {
3362
- toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
3363
- }
3364
-
3365
3651
  // determine vocab type
3366
3652
  {
3367
3653
  std::string tokenizer_name;
3368
3654
 
3369
3655
  ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
3370
3656
 
3371
- if (tokenizer_name == "llama") {
3657
+ if (tokenizer_name == "no_vocab") {
3658
+ vocab.type = LLAMA_VOCAB_TYPE_NONE;
3659
+
3660
+ // default special tokens
3661
+ vocab.special_bos_id = -1;
3662
+ vocab.special_eos_id = -1;
3663
+ vocab.special_unk_id = -1;
3664
+ vocab.special_sep_id = -1;
3665
+ vocab.special_pad_id = -1;
3666
+ vocab.linefeed_id = -1;
3667
+
3668
+ return;
3669
+ } else if (tokenizer_name == "llama") {
3372
3670
  vocab.type = LLAMA_VOCAB_TYPE_SPM;
3373
3671
 
3374
3672
  // default special tokens
@@ -3395,7 +3693,7 @@ static void llm_load_vocab(
3395
3693
 
3396
3694
  for (int i = 0; i < n_merges; i++) {
3397
3695
  const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
3398
- GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
3696
+ GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
3399
3697
 
3400
3698
  std::string first;
3401
3699
  std::string second;
@@ -3434,13 +3732,30 @@ static void llm_load_vocab(
3434
3732
  }
3435
3733
  }
3436
3734
 
3735
+ const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
3736
+ if (token_idx == -1) {
3737
+ throw std::runtime_error("cannot find tokenizer vocab in model file\n");
3738
+ }
3739
+
3740
+ const float * scores = nullptr;
3741
+ const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
3742
+ if (score_idx != -1) {
3743
+ scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
3744
+ }
3745
+
3746
+ const int * toktypes = nullptr;
3747
+ const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
3748
+ if (toktype_idx != -1) {
3749
+ toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
3750
+ }
3751
+
3437
3752
  const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
3438
3753
 
3439
3754
  vocab.id_to_token.resize(n_vocab);
3440
3755
 
3441
3756
  for (uint32_t i = 0; i < n_vocab; i++) {
3442
3757
  std::string word = gguf_get_arr_str(ctx, token_idx, i);
3443
- GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
3758
+ GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
3444
3759
 
3445
3760
  vocab.token_to_id[word] = i;
3446
3761
 
@@ -3632,6 +3947,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
3632
3947
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
3633
3948
  LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
3634
3949
  LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
3950
+ LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
3635
3951
  LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
3636
3952
  LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
3637
3953
  LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
@@ -3639,6 +3955,10 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
3639
3955
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
3640
3956
  LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
3641
3957
  LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
3958
+ LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
3959
+ LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
3960
+ LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
3961
+ LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
3642
3962
  LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
3643
3963
  LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
3644
3964
  if (ml.n_elements >= 1e12) {
@@ -3692,6 +4012,7 @@ static bool llm_load_tensors(
3692
4012
 
3693
4013
  // there is very little benefit to offloading the input layer, so always keep it on the CPU
3694
4014
  model.buft_input = llama_default_buffer_type_cpu(true);
4015
+ //model.buft_input = llama_default_buffer_type_offload(main_gpu);
3695
4016
 
3696
4017
  model.buft_layer.resize(n_layer);
3697
4018
 
@@ -3825,7 +4146,13 @@ static bool llm_load_tensors(
3825
4146
  {
3826
4147
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3827
4148
  if (model.arch != LLM_ARCH_MINICPM){
3828
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
4149
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
4150
+ // if output is NULL, init from the input tok embed
4151
+ if (model.output == NULL) {
4152
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4153
+ ml.n_created--; // artificial tensor
4154
+ ml.size_data += ggml_nbytes(model.output);
4155
+ }
3829
4156
  }
3830
4157
  }
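Note: the output projection becomes optional here; when output.weight is absent from the GGUF, the token embedding matrix is reused (weight tying), so the final logits come from the same [n_embd, n_vocab] matrix that embeds the tokens. A rough toy illustration of what that tying means numerically, with assumed sizes and values:

#include <cstdio>
#include <vector>

// The same [n_vocab x n_embd] matrix W is used both to embed token ids (row lookup)
// and to project a hidden state to logits (W * h).
int main() {
    const int n_vocab = 4, n_embd = 3;
    std::vector<float> W = {   // row v = embedding of token v (assumed toy values)
        1, 0, 0,
        0, 1, 0,
        0, 0, 1,
        1, 1, 1,
    };
    std::vector<float> h = {0.2f, 0.7f, 0.1f}; // hidden state of the last token

    std::vector<float> logits(n_vocab, 0.0f);
    for (int v = 0; v < n_vocab; ++v) {
        for (int e = 0; e < n_embd; ++e) {
            logits[v] += W[v * n_embd + e] * h[e];
        }
    }
    for (int v = 0; v < n_vocab; ++v) {
        std::printf("logit[%d] = %.2f\n", v, logits[v]);
    }
    return 0;
}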
3831
4158
 
@@ -4490,6 +4817,107 @@ static bool llm_load_tensors(
4490
4817
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4491
4818
  }
4492
4819
  } break;
4820
+ case LLM_ARCH_STARCODER2:
4821
+ {
4822
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4823
+
4824
+ // output
4825
+ {
4826
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4827
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
4828
+
4829
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
4830
+ // if output is NULL, init from the input tok embed
4831
+ if (model.output == NULL) {
4832
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4833
+ ml.n_created--; // artificial tensor
4834
+ ml.size_data += ggml_nbytes(model.output);
4835
+ }
4836
+
4837
+ }
4838
+
4839
+ for (int i = 0; i < n_layer; ++i) {
4840
+ ggml_context * ctx_layer = ctx_for_layer(i);
4841
+ ggml_context * ctx_split = ctx_for_layer_split(i);
4842
+
4843
+ auto & layer = model.layers[i];
4844
+
4845
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4846
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
4847
+
4848
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
4849
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
4850
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
4851
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
4852
+
4853
+ // optional bias tensors
4854
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
4855
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
4856
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
4857
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
4858
+
4859
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4860
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
4861
+
4862
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4863
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4864
+
4865
+ // optional bias tensors
4866
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
4867
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff});
4868
+ }
4869
+ } break;
4870
+ case LLM_ARCH_MAMBA:
4871
+ {
4872
+ const int64_t d_conv = hparams.ssm_d_conv;
4873
+ const int64_t d_inner = hparams.ssm_d_inner;
4874
+ const int64_t d_state = hparams.ssm_d_state;
4875
+ const int64_t dt_rank = hparams.ssm_dt_rank;
4876
+ // only an expansion factor of 2 is supported for now
4877
+ GGML_ASSERT(2 * n_embd == d_inner);
4878
+
4879
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4880
+
4881
+ // output
4882
+ {
4883
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4884
+
4885
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
4886
+ // if output is NULL, init from the input tok embed, duplicated to allow offloading
4887
+ if (model.output == NULL) {
4888
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4889
+ ml.n_created--; // artificial tensor
4890
+ ml.size_data += ggml_nbytes(model.output);
4891
+ }
4892
+ }
4893
+
4894
+ for (int i = 0; i < n_layer; ++i) {
4895
+ ggml_context * ctx_layer = ctx_for_layer(i);
4896
+ ggml_context * ctx_split = ctx_for_layer_split(i);
4897
+
4898
+ auto & layer = model.layers[i];
4899
+
4900
+ // norm
4901
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4902
+
4903
+ layer.ssm_in = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner});
4904
+
4905
+ layer.ssm_conv1d = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner});
4906
+ layer.ssm_conv1d_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner});
4907
+
4908
+ layer.ssm_x = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state});
4909
+
4910
+ layer.ssm_dt = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner});
4911
+ layer.ssm_dt_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner});
4912
+
4913
+ // no "weight" suffix for these
4914
+ layer.ssm_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner});
4915
+ layer.ssm_d = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_D, i), {d_inner});
4916
+
4917
+ // out_proj
4918
+ layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
4919
+ }
4920
+ } break;
4493
4921
  default:
4494
4922
  throw std::runtime_error("unknown architecture");
4495
4923
  }
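Note: the LLM_ARCH_MAMBA branch derives every tensor shape from the four SSM hyperparameters and asserts the fixed expansion factor 2 * n_embd == d_inner. A worked shape check with the same illustrative small-model numbers used earlier; the concrete values, including the dt_rank heuristic, are assumptions rather than something read from this diff:

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    // Illustrative Mamba hyperparameters (assumptions):
    const int64_t n_embd  = 768;
    const int64_t d_conv  = 4;
    const int64_t d_inner = 1536;   // 2 * n_embd, as asserted in the loader
    const int64_t d_state = 16;
    const int64_t dt_rank = 48;     // commonly ceil(n_embd / 16) in Mamba configs (assumption)

    assert(2 * n_embd == d_inner);

    // Per-layer tensor shapes created by the LLM_ARCH_MAMBA branch:
    std::printf("ssm_in     : {%lld, %lld}\n", (long long) n_embd,  (long long) (2*d_inner));           // {768, 3072}
    std::printf("ssm_conv1d : {%lld, %lld}\n", (long long) d_conv,  (long long) d_inner);               // {4, 1536}
    std::printf("ssm_x      : {%lld, %lld}\n", (long long) d_inner, (long long) (dt_rank + 2*d_state)); // {1536, 80}
    std::printf("ssm_dt     : {%lld, %lld}\n", (long long) dt_rank, (long long) d_inner);               // {48, 1536}
    std::printf("ssm_a      : {%lld, %lld}\n", (long long) d_state, (long long) d_inner);               // {16, 1536}
    std::printf("ssm_out    : {%lld, %lld}\n", (long long) d_inner, (long long) n_embd);                // {1536, 768}
    return 0;
}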
@@ -4610,7 +5038,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
4610
5038
 
4611
5039
  llm_load_print_meta(ml, model);
4612
5040
 
4613
- if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
5041
+ if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
5042
+ model.hparams.n_vocab != model.vocab.id_to_token.size()) {
4614
5043
  throw std::runtime_error("vocab size mismatch");
4615
5044
  }
4616
5045
 
@@ -4674,29 +5103,32 @@ enum llm_norm_type {
4674
5103
 
4675
5104
  static struct ggml_tensor * llm_build_inp_embd(
4676
5105
  struct ggml_context * ctx,
5106
+ struct llama_context & lctx,
4677
5107
  const llama_hparams & hparams,
4678
5108
  const llama_batch & batch,
4679
5109
  struct ggml_tensor * tok_embd,
4680
- struct ggml_tensor * inp_tokens,
4681
- struct ggml_tensor * inp_embd,
4682
5110
  const llm_build_cb & cb) {
4683
5111
  const int64_t n_embd = hparams.n_embd;
4684
5112
 
4685
5113
  struct ggml_tensor * inpL;
4686
5114
 
4687
5115
  if (batch.token) {
4688
- struct ggml_tensor * inp_tokens_v = ggml_view_1d(ctx, inp_tokens, batch.n_tokens, 0);
4689
- cb(inp_tokens, "inp_tokens", -1);
5116
+ lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
5117
+ cb(lctx.inp_tokens, "inp_tokens", -1);
5118
+ ggml_set_input(lctx.inp_tokens);
4690
5119
 
4691
- inpL = ggml_get_rows(ctx, tok_embd, inp_tokens_v);
5120
+ inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
4692
5121
  } else {
4693
5122
  #ifdef GGML_USE_MPI
4694
5123
  GGML_ASSERT(false && "not implemented");
4695
5124
  #endif
4696
-
4697
- inpL = ggml_view_2d(ctx, inp_embd, n_embd, batch.n_tokens, inp_embd->nb[1], 0);
5125
+ lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
5126
+ inpL = lctx.inp_embd;
5127
+ ggml_set_input(lctx.inp_embd);
4698
5128
  }
4699
5129
 
5130
+ cb(inpL, "inp_embd", -1);
5131
+
4700
5132
  return inpL;
4701
5133
  }
4702
5134
 
@@ -4715,6 +5147,8 @@ static void llm_build_kv_store(
4715
5147
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
4716
5148
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
4717
5149
 
5150
+ GGML_ASSERT(kv.size == n_ctx);
5151
+
4718
5152
  // compute the transposed [n_tokens, n_embd] V matrix
4719
5153
  struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
4720
5154
  //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
@@ -4901,8 +5335,8 @@ static struct ggml_tensor * llm_build_kqv(
4901
5335
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
4902
5336
  }
4903
5337
 
4904
- #if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE)
4905
- #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, and Kompute")
5338
+ #if defined(GGML_USE_KOMPUTE)
5339
+ #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
4906
5340
  #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
4907
5341
  #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
4908
5342
  if (hparams.f_max_alibi_bias > 0.0f) {
@@ -4924,6 +5358,8 @@ static struct ggml_tensor * llm_build_kqv(
4924
5358
  cb(kq, "kq_soft_max_ext", il);
4925
5359
  }
4926
5360
 
5361
+ GGML_ASSERT(kv.size == n_ctx);
5362
+
4927
5363
  // split cached v into n_head heads
4928
5364
  struct ggml_tensor * v =
4929
5365
  ggml_view_3d(ctx, kv.v_l[il],
@@ -4986,6 +5422,7 @@ static struct ggml_tensor * llm_build_kv(
4986
5422
  llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
4987
5423
 
4988
5424
  struct ggml_tensor * cur;
5425
+
4989
5426
  cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
4990
5427
  q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
4991
5428
  cb(cur, "kqv_out", il);
@@ -4995,7 +5432,7 @@ static struct ggml_tensor * llm_build_kv(
4995
5432
 
4996
5433
  struct llm_build_context {
4997
5434
  const llama_model & model;
4998
- const llama_context & lctx;
5435
+ llama_context & lctx;
4999
5436
  const llama_hparams & hparams;
5000
5437
  const llama_cparams & cparams;
5001
5438
  const llama_batch & batch;
@@ -5070,10 +5507,10 @@ struct llm_build_context {
5070
5507
  norm_eps (hparams.f_norm_eps),
5071
5508
  norm_rms_eps (hparams.f_norm_rms_eps),
5072
5509
  n_tokens (batch.n_tokens),
5073
- n_kv (worst_case ? n_ctx : kv_self.n),
5074
- kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
5510
+ n_kv (worst_case ? kv_self.size : kv_self.n),
5511
+ kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
5075
5512
  n_orig_ctx (cparams.n_yarn_orig_ctx),
5076
- pooling_type (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE),
5513
+ pooling_type (cparams.pooling_type),
5077
5514
  rope_type (hparams.rope_type),
5078
5515
  cb (cb),
5079
5516
  buf_compute_meta (lctx.buf_compute_meta) {
@@ -5088,6 +5525,18 @@ struct llm_build_context {
5088
5525
  };
5089
5526
 
5090
5527
  ctx0 = ggml_init(params);
5528
+
5529
+ lctx.inp_tokens = nullptr;
5530
+ lctx.inp_embd = nullptr;
5531
+ lctx.inp_pos = nullptr;
5532
+ lctx.inp_KQ_mask = nullptr;
5533
+ lctx.inp_KQ_pos = nullptr;
5534
+ lctx.inp_K_shift = nullptr;
5535
+ lctx.inp_mean = nullptr;
5536
+ lctx.inp_cls = nullptr;
5537
+ lctx.inp_s_copy = nullptr;
5538
+ lctx.inp_s_mask = nullptr;
5539
+ lctx.inp_s_seq = nullptr;
5091
5540
  }
5092
5541
 
5093
5542
  void free() {
@@ -5100,6 +5549,12 @@ struct llm_build_context {
5100
5549
  struct ggml_cgraph * build_k_shift() {
5101
5550
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5102
5551
 
5552
+ GGML_ASSERT(kv_self.size == n_ctx);
5553
+
5554
+ lctx.inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
5555
+ cb(lctx.inp_K_shift, "K_shift", -1);
5556
+ ggml_set_input(lctx.inp_K_shift);
5557
+
5103
5558
  for (int il = 0; il < n_layer; ++il) {
5104
5559
  struct ggml_tensor * tmp =
5105
5560
  // we rotate only the first n_rot dimensions
@@ -5118,6 +5573,29 @@ struct llm_build_context {
5118
5573
  return gf;
5119
5574
  }
5120
5575
 
5576
+ struct ggml_cgraph * build_s_copy() {
5577
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5578
+
5579
+ GGML_ASSERT(kv_self.recurrent);
5580
+
5581
+ struct ggml_tensor * state_copy = build_inp_s_copy();
5582
+
5583
+ for (int il = 0; il < n_layer; ++il) {
5584
+ struct ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s(), kv_self.size);
5585
+ struct ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], hparams.n_embd_v_s(), kv_self.size);
5586
+
5587
+ conv_states = ggml_get_rows(ctx0, conv_states, state_copy);
5588
+ ssm_states = ggml_get_rows(ctx0, ssm_states, state_copy);
5589
+
5590
+ // TODO: name the intermediate tensors with cb()
5591
+
5592
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_states, kv_self.k_l[il]));
5593
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, ssm_states, kv_self.v_l[il]));
5594
+ }
5595
+
5596
+ return gf;
5597
+ }
5598
+
5121
5599
  struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
5122
5600
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5123
5601
 
@@ -5167,6 +5645,66 @@ struct llm_build_context {
5167
5645
  return gf;
5168
5646
  }
5169
5647
 
5648
+ struct ggml_tensor * build_inp_pos() {
5649
+ lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5650
+ cb(lctx.inp_pos, "inp_pos", -1);
5651
+ ggml_set_input(lctx.inp_pos);
5652
+ return lctx.inp_pos;
5653
+ }
5654
+
5655
+ struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
5656
+ if (causal) {
5657
+ lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
5658
+ } else {
5659
+ lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
5660
+ }
5661
+ cb(lctx.inp_KQ_mask, "KQ_mask", -1);
5662
+ ggml_set_input(lctx.inp_KQ_mask);
5663
+ return lctx.inp_KQ_mask;
5664
+ }
5665
+
5666
+ struct ggml_tensor * build_inp_KQ_pos() {
5667
+ lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
5668
+ cb(lctx.inp_KQ_pos, "KQ_pos", -1);
5669
+ ggml_set_input(lctx.inp_KQ_pos);
5670
+ return lctx.inp_KQ_pos;
5671
+ }
5672
+
5673
+ struct ggml_tensor * build_inp_mean() {
5674
+ lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
5675
+ cb(lctx.inp_mean, "inp_mean", -1);
5676
+ ggml_set_input(lctx.inp_mean);
5677
+ return lctx.inp_mean;
5678
+ }
5679
+
5680
+ struct ggml_tensor * build_inp_cls() {
5681
+ lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5682
+ cb(lctx.inp_cls, "inp_cls", -1);
5683
+ ggml_set_input(lctx.inp_cls);
5684
+ return lctx.inp_cls;
5685
+ }
5686
+
5687
+ struct ggml_tensor * build_inp_s_copy() {
5688
+ lctx.inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, kv_self.size);
5689
+ cb(lctx.inp_s_copy, "inp_s_copy", -1);
5690
+ ggml_set_input(lctx.inp_s_copy);
5691
+ return lctx.inp_s_copy;
5692
+ }
5693
+
5694
+ struct ggml_tensor * build_inp_s_mask() {
5695
+ lctx.inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv);
5696
+ cb(lctx.inp_s_mask, "inp_s_mask", -1);
5697
+ ggml_set_input(lctx.inp_s_mask);
5698
+ return lctx.inp_s_mask;
5699
+ }
5700
+
5701
+ struct ggml_tensor * build_inp_s_seq() {
5702
+ lctx.inp_s_seq = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens);
5703
+ cb(lctx.inp_s_seq, "inp_s_seq", -1);
5704
+ ggml_set_input(lctx.inp_s_seq);
5705
+ return lctx.inp_s_seq;
5706
+ }
5707
+
5170
5708
  struct ggml_cgraph * build_llama() {
5171
5709
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5172
5710
 
@@ -5177,16 +5715,13 @@ struct llm_build_context {
5177
5715
  struct ggml_tensor * cur;
5178
5716
  struct ggml_tensor * inpL;
5179
5717
 
5180
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5181
- cb(inpL, "inp_embd", -1);
5718
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
5182
5719
 
5183
5720
  // inp_pos - contains the positions
5184
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5185
- cb(inp_pos, "inp_pos", -1);
5721
+ struct ggml_tensor * inp_pos = build_inp_pos();
5186
5722
 
5187
5723
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5188
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5189
- cb(KQ_mask, "KQ_mask", -1);
5724
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
5190
5725
 
5191
5726
  for (int il = 0; il < n_layer; ++il) {
5192
5727
  struct ggml_tensor * inpSA = inpL;
@@ -5238,7 +5773,6 @@ struct llm_build_context {
5238
5773
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5239
5774
  model.layers[il].wo, model.layers[il].bo,
5240
5775
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5241
- cb(cur, "kqv_out", il);
5242
5776
  }
5243
5777
 
5244
5778
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -5356,20 +5890,16 @@ struct llm_build_context {
5356
5890
  struct ggml_tensor * cur;
5357
5891
  struct ggml_tensor * inpL;
5358
5892
 
5359
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5360
- cb(inpL, "inp_embd", -1);
5893
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
5361
5894
 
5362
5895
  // inp_pos - contains the positions
5363
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5364
- cb(inp_pos, "inp_pos", -1);
5896
+ struct ggml_tensor * inp_pos = build_inp_pos();
5365
5897
 
5366
5898
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5367
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5368
- cb(KQ_mask, "KQ_mask", -1);
5899
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
5369
5900
 
5370
5901
  // positions of the tokens in the KV cache
5371
- struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
5372
- cb(KQ_pos, "KQ_pos", -1);
5902
+ struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
5373
5903
 
5374
5904
  for (int il = 0; il < n_layer; ++il) {
5375
5905
  struct ggml_tensor * inpSA = inpL;
@@ -5417,7 +5947,6 @@ struct llm_build_context {
5417
5947
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5418
5948
  model.layers[il].wo, NULL,
5419
5949
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5420
- cb(cur, "kqv_out", il);
5421
5950
  }
5422
5951
 
5423
5952
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -5473,16 +6002,13 @@ struct llm_build_context {
5473
6002
  struct ggml_tensor * cur;
5474
6003
  struct ggml_tensor * inpL;
5475
6004
 
5476
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5477
- cb(inpL, "inp_embd", -1);
6005
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
5478
6006
 
5479
6007
  // inp_pos - contains the positions
5480
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5481
- cb(inp_pos, "inp_pos", -1);
6008
+ struct ggml_tensor * inp_pos = build_inp_pos();
5482
6009
 
5483
6010
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5484
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5485
- cb(KQ_mask, "KQ_mask", -1);
6011
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
5486
6012
 
5487
6013
  for (int il = 0; il < n_layer; ++il) {
5488
6014
  struct ggml_tensor * attn_norm;
@@ -5536,7 +6062,6 @@ struct llm_build_context {
5536
6062
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5537
6063
  model.layers[il].wo, NULL,
5538
6064
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5539
- cb(cur, "kqv_out", il);
5540
6065
  }
5541
6066
 
5542
6067
  struct ggml_tensor * ffn_inp = cur;
@@ -5587,21 +6112,17 @@ struct llm_build_context {
5587
6112
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5588
6113
 
5589
6114
  struct ggml_tensor * cur;
5590
- struct ggml_tensor * pos;
5591
6115
  struct ggml_tensor * inpL;
5592
6116
 
5593
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5594
- cb(inpL, "inp_embd", -1);
6117
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
5595
6118
 
5596
6119
  // inp_pos - contains the positions
5597
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5598
- cb(inp_pos, "inp_pos", -1);
6120
+ struct ggml_tensor * inp_pos = build_inp_pos();
5599
6121
 
5600
6122
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5601
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5602
- cb(KQ_mask, "KQ_mask", -1);
6123
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
5603
6124
 
5604
- pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
6125
+ struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
5605
6126
  cb(pos, "pos_embd", -1);
5606
6127
 
5607
6128
  inpL = ggml_add(ctx0, inpL, pos);
@@ -5635,7 +6156,6 @@ struct llm_build_context {
5635
6156
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5636
6157
  model.layers[il].wo, model.layers[il].bo,
5637
6158
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5638
- cb(cur, "kqv_out", il);
5639
6159
  }
5640
6160
 
5641
6161
  // add the input
@@ -5687,16 +6207,13 @@ struct llm_build_context {
5687
6207
  struct ggml_tensor * cur;
5688
6208
  struct ggml_tensor * inpL;
5689
6209
 
5690
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5691
- cb(inpL, "inp_embd", -1);
6210
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
5692
6211
 
5693
6212
  // inp_pos - contains the positions
5694
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5695
- cb(inp_pos, "inp_pos", -1);
6213
+ struct ggml_tensor * inp_pos = build_inp_pos();
5696
6214
 
5697
6215
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5698
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5699
- cb(KQ_mask, "KQ_mask", -1);
6216
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
5700
6217
 
5701
6218
  for (int il = 0; il < n_layer; ++il) {
5702
6219
  struct ggml_tensor * residual = inpL;
@@ -5836,7 +6353,6 @@ struct llm_build_context {
5836
6353
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5837
6354
  model.layers[il].wo, model.layers[il].bo,
5838
6355
  Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5839
- cb(cur, "kqv_out", il);
5840
6356
  }
5841
6357
 
5842
6358
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
@@ -5890,16 +6406,13 @@ struct llm_build_context {
5890
6406
  struct ggml_tensor * cur;
5891
6407
  struct ggml_tensor * inpL;
5892
6408
 
5893
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5894
- cb(inpL, "inp_embd", -1);
6409
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
5895
6410
 
5896
6411
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5897
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5898
- cb(KQ_mask, "KQ_mask", -1);
6412
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
5899
6413
 
5900
6414
  // positions of the tokens in the KV cache
5901
- struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
5902
- cb(KQ_pos, "KQ_pos", -1);
6415
+ struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
5903
6416
 
5904
6417
  for (int il = 0; il < n_layer; ++il) {
5905
6418
  struct ggml_tensor * inpSA = inpL;
@@ -5929,7 +6442,6 @@ struct llm_build_context {
5929
6442
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5930
6443
  model.layers[il].wo, NULL,
5931
6444
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5932
- cb(cur, "kqv_out", il);
5933
6445
  }
5934
6446
 
5935
6447
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -5979,19 +6491,18 @@ struct llm_build_context {
5979
6491
 
5980
6492
  const int64_t n_embd_head = hparams.n_embd_head_v;
5981
6493
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6494
+
5982
6495
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5983
6496
 
5984
6497
  struct ggml_tensor * cur;
5985
6498
  struct ggml_tensor * inpL;
5986
6499
 
5987
- // get input vectors with right size
5988
- const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
5989
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5990
- struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
5991
- struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
6500
+ struct ggml_tensor * inp_pos = build_inp_pos();
6501
+ struct ggml_tensor * inp_mean = build_inp_mean();
6502
+ struct ggml_tensor * inp_cls = build_inp_cls();
5992
6503
 
5993
6504
  // construct input embeddings (token, type, position)
5994
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6505
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
5995
6506
 
5996
6507
  // token types are hardcoded to zero ("Sentence A")
5997
6508
  struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
@@ -6006,39 +6517,37 @@ struct llm_build_context {
6006
6517
  cb(inpL, "inp_norm", -1);
6007
6518
 
6008
6519
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6009
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6010
- cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
6520
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false);
6011
6521
 
6012
6522
  // iterate layers
6013
6523
  for (int il = 0; il < n_layer; ++il) {
6014
6524
  struct ggml_tensor * cur = inpL;
6015
6525
 
6526
+ struct ggml_tensor * Qcur;
6527
+ struct ggml_tensor * Kcur;
6528
+ struct ggml_tensor * Vcur;
6529
+
6016
6530
  // self-attention
6017
6531
  if (model.arch == LLM_ARCH_BERT) {
6018
- struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
6532
+ Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
6019
6533
  cb(Qcur, "Qcur", il);
6020
6534
 
6021
- struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
6535
+ Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
6022
6536
  cb(Kcur, "Kcur", il);
6023
6537
 
6024
- struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
6538
+ Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
6025
6539
  cb(Vcur, "Vcur", il);
6026
6540
 
6027
- // seems like we just need to do this for Q?
6028
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6029
-
6030
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6031
- model.layers[il].wo, model.layers[il].bo,
6032
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6033
- cb(cur, "kqv_out", il);
6541
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6542
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6034
6543
  } else {
6035
6544
  // compute Q and K and RoPE them
6036
6545
  cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
6037
6546
  cb(cur, "wqkv", il);
6038
6547
 
6039
- struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6040
- struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6041
- struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6548
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6549
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6550
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6042
6551
 
6043
6552
  cb(Qcur, "Qcur", il);
6044
6553
  cb(Kcur, "Kcur", il);
@@ -6057,12 +6566,40 @@ struct llm_build_context {
6057
6566
  ext_factor, attn_factor, beta_fast, beta_slow
6058
6567
  );
6059
6568
  cb(Kcur, "Kcur", il);
6569
+ }
6060
6570
 
6061
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6062
- model.layers[il].wo, model.layers[il].bo,
6063
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6064
- cb(cur, "kqv_out", il);
6571
+ struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
6572
+ struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
6573
+
6574
+ struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
6575
+ cb(kq, "kq", il);
6576
+
6577
+ kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
6578
+ cb(kq, "kq_soft_max_ext", il);
6579
+
6580
+ struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
6581
+ cb(v, "v", il);
6582
+
6583
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
6584
+ cb(kqv, "kqv", il);
6585
+
6586
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
6587
+ cb(kqv_merged, "kqv_merged", il);
6588
+
6589
+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
6590
+ cb(cur, "kqv_merged_cont", il);
6591
+
6592
+ ggml_build_forward_expand(gf, cur);
6593
+
6594
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
6595
+ if (model.layers[il].bo) {
6596
+ cb(cur, "kqv_wo", il);
6597
+ }
6598
+
6599
+ if (model.layers[il].bo) {
6600
+ cur = ggml_add(ctx0, cur, model.layers[il].bo);
6065
6601
  }
6602
+ cb(cur, "kqv_out", il);
6066
6603
 
6067
6604
  // re-add the layer input
6068
6605
  cur = ggml_add(ctx0, cur, inpL);
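Editor's note: in the reworked BERT attention above, the padding mask and any ALiBi bias are folded directly into ggml_soft_max_ext instead of being added as separate graph nodes. A standalone scalar sketch of the intended math for one query row; the ALiBi term (slope m times key distance) and the head size are assumptions made for illustration, not values taken from the patch:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        // toy attention scores of one query against 4 keys
        std::vector<float> kq   = { 1.0f, 0.5f, -0.2f, 0.3f };
        std::vector<float> mask = { 0.0f, 0.0f, 0.0f, -INFINITY }; // last key is padding
        const float scale = 1.0f/std::sqrt(64.0f); // 1/sqrt(n_embd_head), head size assumed to be 64
        const float m     = 0.5f;                  // assumed ALiBi slope for this head (0 disables the bias)

        std::vector<float> p(kq.size());
        float sum = 0.0f;
        for (size_t i = 0; i < kq.size(); ++i) {
            const float dist = float(kq.size() - 1 - i);     // distance of key i from the query position
            p[i] = std::exp(scale*kq[i] + mask[i] - m*dist); // scaled score + mask + ALiBi bias
            sum += p[i];
        }
        for (size_t i = 0; i < kq.size(); ++i) {
            printf("p[%zu] = %.4f\n", i, p[i]/sum);          // the masked key ends up with probability 0
        }
        return 0;
    }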
@@ -6103,16 +6640,29 @@ struct llm_build_context {
6103
6640
 
6104
6641
  // final output
6105
6642
  cur = inpL;
6643
+ cb(cur, "result_embd", -1);
6106
6644
 
6107
6645
  // pooling layer
6108
- if (pooling_type == LLAMA_POOLING_TYPE_MEAN) {
6109
- cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
6110
- } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) {
6111
- cur = ggml_get_rows(ctx0, cur, inp_cls);
6112
- } else {
6113
- GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type");
6646
+ switch (pooling_type) {
6647
+ case LLAMA_POOLING_TYPE_NONE:
6648
+ {
6649
+ // nop
6650
+ } break;
6651
+ case LLAMA_POOLING_TYPE_MEAN:
6652
+ {
6653
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
6654
+ cb(cur, "result_embd_pooled", -1);
6655
+ } break;
6656
+ case LLAMA_POOLING_TYPE_CLS:
6657
+ {
6658
+ cur = ggml_get_rows(ctx0, cur, inp_cls);
6659
+ cb(cur, "result_embd_pooled", -1);
6660
+ } break;
6661
+ case LLAMA_POOLING_TYPE_UNSPECIFIED:
6662
+ {
6663
+ GGML_ASSERT(false && "Invalid pooling type");
6664
+ } break;
6114
6665
  }
6115
- cb(cur, "result_embd", -1);
6116
6666
 
6117
6667
  ggml_build_forward_expand(gf, cur);
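Editor's note: the pooling switch above leans on two of the new input tensors. inp_cls holds, per sequence, the index of that sequence's first token, so ggml_get_rows picks out one embedding per sequence; inp_mean is an n_tokens x n_tokens matrix that averages the token embeddings of each sequence when multiplied against the hidden states. A small standalone sketch of how such an averaging matrix can be filled; the row/column layout is an assumption for illustration and only loosely mirrors the llama_set_inputs hunk further down:

    #include <cstdio>
    #include <vector>

    int main() {
        // toy batch: tokens 0..2 belong to sequence 0, token 3 to sequence 1
        const int n_tokens = 4;
        const int seq_id[n_tokens] = { 0, 0, 0, 1 };

        std::vector<int>   count(n_tokens, 0);
        std::vector<float> inp_mean(n_tokens*n_tokens, 0.0f);

        for (int i = 0; i < n_tokens; ++i) {
            count[seq_id[i]] += 1;                           // tokens per sequence
        }
        for (int i = 0; i < n_tokens; ++i) {
            const int s = seq_id[i];
            inp_mean[s*n_tokens + i] = 1.0f/float(count[s]); // row s averages sequence s's tokens
        }

        for (int r = 0; r < n_tokens; ++r) {
            for (int c = 0; c < n_tokens; ++c) {
                printf("%.2f ", inp_mean[r*n_tokens + c]);
            }
            printf("\n");
        }
        return 0;
    }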
6118
6668
 
@@ -6129,16 +6679,13 @@ struct llm_build_context {
6129
6679
  struct ggml_tensor * cur;
6130
6680
  struct ggml_tensor * inpL;
6131
6681
 
6132
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6133
- cb(inpL, "inp_embd", -1);
6682
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6134
6683
 
6135
6684
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6136
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6137
- cb(KQ_mask, "KQ_mask", -1);
6685
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6138
6686
 
6139
6687
  // positions of the tokens in the KV cache
6140
- struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
6141
- cb(KQ_pos, "KQ_pos", -1);
6688
+ struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
6142
6689
 
6143
6690
  inpL = llm_build_norm(ctx0, inpL, hparams,
6144
6691
  model.tok_norm,
@@ -6174,7 +6721,6 @@ struct llm_build_context {
6174
6721
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6175
6722
  model.layers[il].wo, model.layers[il].bo,
6176
6723
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6177
- cb(cur, "kqv_out", il);
6178
6724
  }
6179
6725
 
6180
6726
  // Add the input
@@ -6226,16 +6772,13 @@ struct llm_build_context {
6226
6772
  struct ggml_tensor * cur;
6227
6773
  struct ggml_tensor * inpL;
6228
6774
 
6229
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6230
- cb(inpL, "inp_embd", -1);
6775
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6231
6776
 
6232
6777
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6233
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6234
- cb(KQ_mask, "KQ_mask", -1);
6778
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6235
6779
 
6236
6780
  // positions of the tokens in the KV cache
6237
- struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
6238
- cb(KQ_pos, "KQ_pos", -1);
6781
+ struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
6239
6782
 
6240
6783
  for (int il = 0; il < n_layer; ++il) {
6241
6784
  struct ggml_tensor * attn_norm;
@@ -6276,7 +6819,6 @@ struct llm_build_context {
6276
6819
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6277
6820
  model.layers[il].wo, model.layers[il].bo,
6278
6821
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6279
- cb(cur, "kqv_out", il);
6280
6822
  }
6281
6823
 
6282
6824
  // Add the input
@@ -6331,16 +6873,13 @@ struct llm_build_context {
6331
6873
  struct ggml_tensor * cur;
6332
6874
  struct ggml_tensor * inpL;
6333
6875
 
6334
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6335
- cb(inpL, "inp_embd", -1);
6876
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6336
6877
 
6337
6878
  // inp_pos - contains the positions
6338
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6339
- cb(inp_pos, "inp_pos", -1);
6879
+ struct ggml_tensor * inp_pos = build_inp_pos();
6340
6880
 
6341
6881
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6342
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6343
- cb(KQ_mask, "KQ_mask", -1);
6882
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6344
6883
 
6345
6884
  for (int il = 0; il < n_layer; ++il) {
6346
6885
  struct ggml_tensor * inpSA = inpL;
@@ -6393,7 +6932,6 @@ struct llm_build_context {
6393
6932
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6394
6933
  model.layers[il].wo, NULL,
6395
6934
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6396
- cb(cur, "kqv_out", il);
6397
6935
  }
6398
6936
 
6399
6937
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -6449,16 +6987,13 @@ struct llm_build_context {
6449
6987
  struct ggml_tensor * cur;
6450
6988
  struct ggml_tensor * inpL;
6451
6989
 
6452
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6453
- cb(inpL, "inp_embd", -1);
6990
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6454
6991
 
6455
6992
  // inp_pos - contains the positions
6456
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6457
- cb(inp_pos, "inp_pos", -1);
6993
+ struct ggml_tensor * inp_pos = build_inp_pos();
6458
6994
 
6459
6995
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6460
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6461
- cb(KQ_mask, "KQ_mask", -1);
6996
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6462
6997
 
6463
6998
  for (int il = 0; il < n_layer; ++il) {
6464
6999
  struct ggml_tensor * inpSA = inpL;
@@ -6503,7 +7038,6 @@ struct llm_build_context {
6503
7038
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6504
7039
  model.layers[il].wo, NULL,
6505
7040
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6506
- cb(cur, "kqv_out", il);
6507
7041
  }
6508
7042
 
6509
7043
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -6558,16 +7092,13 @@ struct llm_build_context {
6558
7092
  struct ggml_tensor * cur;
6559
7093
  struct ggml_tensor * inpL;
6560
7094
 
6561
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6562
- cb(inpL, "inp_embd", -1);
7095
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6563
7096
 
6564
7097
  // inp_pos - contains the positions
6565
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6566
- cb(inp_pos, "inp_pos", -1);
7098
+ struct ggml_tensor * inp_pos = build_inp_pos();
6567
7099
 
6568
7100
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6569
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6570
- cb(KQ_mask, "KQ_mask", -1);
7101
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6571
7102
 
6572
7103
  for (int il = 0; il < n_layer; ++il) {
6573
7104
  struct ggml_tensor * inpSA = inpL;
@@ -6619,7 +7150,6 @@ struct llm_build_context {
6619
7150
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6620
7151
  model.layers[il].wo, model.layers[il].bo,
6621
7152
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6622
- cb(cur, "kqv_out", il);
6623
7153
  }
6624
7154
 
6625
7155
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -6674,16 +7204,13 @@ struct llm_build_context {
6674
7204
  struct ggml_tensor * ffn_output;
6675
7205
  struct ggml_tensor * inpL;
6676
7206
 
6677
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6678
- cb(inpL, "inp_embd", -1);
7207
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6679
7208
 
6680
7209
  // inp_pos - contains the positions
6681
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6682
- cb(inp_pos, "inp_pos", -1);
7210
+ struct ggml_tensor * inp_pos = build_inp_pos();
6683
7211
 
6684
7212
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6685
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6686
- cb(KQ_mask, "KQ_mask", -1);
7213
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6687
7214
 
6688
7215
  for (int il = 0; il < n_layer; ++il) {
6689
7216
  attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
@@ -6741,7 +7268,6 @@ struct llm_build_context {
6741
7268
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6742
7269
  model.layers[il].wo, model.layers[il].bo,
6743
7270
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
6744
- cb(cur, "kqv_out", il);
6745
7271
  }
6746
7272
 
6747
7273
  // FF
@@ -6791,16 +7317,13 @@ struct llm_build_context {
6791
7317
  struct ggml_tensor * cur;
6792
7318
  struct ggml_tensor * inpL;
6793
7319
 
6794
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6795
- cb(inpL, "inp_embd", -1);
7320
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6796
7321
 
6797
7322
  // inp_pos - contains the positions
6798
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6799
- cb(inp_pos, "inp_pos", -1);
7323
+ struct ggml_tensor * inp_pos = build_inp_pos();
6800
7324
 
6801
7325
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6802
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6803
- cb(KQ_mask, "KQ_mask", -1);
7326
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6804
7327
 
6805
7328
  for (int il = 0; il < n_layer; ++il) {
6806
7329
 
@@ -6839,7 +7362,6 @@ struct llm_build_context {
6839
7362
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6840
7363
  model.layers[il].wo, NULL,
6841
7364
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6842
- cb(cur, "kqv_out", il);
6843
7365
  }
6844
7366
  struct ggml_tensor * sa_out = cur;
6845
7367
 
@@ -6893,16 +7415,13 @@ struct llm_build_context {
6893
7415
  struct ggml_tensor * pos;
6894
7416
  struct ggml_tensor * inpL;
6895
7417
 
6896
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6897
- cb(inpL, "inp_embd", -1);
7418
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6898
7419
 
6899
7420
  // inp_pos - contains the positions
6900
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6901
- cb(inp_pos, "inp_pos", -1);
7421
+ struct ggml_tensor * inp_pos = build_inp_pos();
6902
7422
 
6903
7423
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6904
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6905
- cb(KQ_mask, "KQ_mask", -1);
7424
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6906
7425
 
6907
7426
  pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
6908
7427
  cb(pos, "pos_embd", -1);
@@ -6938,7 +7457,6 @@ struct llm_build_context {
6938
7457
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6939
7458
  model.layers[il].wo, model.layers[il].bo,
6940
7459
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6941
- cb(cur, "kqv_out", il);
6942
7460
  }
6943
7461
 
6944
7462
  // add the input
@@ -6991,16 +7509,13 @@ struct llm_build_context {
6991
7509
  struct ggml_tensor * cur;
6992
7510
  struct ggml_tensor * inpL;
6993
7511
 
6994
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6995
- cb(inpL, "inp_embd", -1);
7512
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6996
7513
 
6997
7514
  // inp_pos - contains the positions
6998
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6999
- cb(inp_pos, "inp_pos", -1);
7515
+ struct ggml_tensor * inp_pos = build_inp_pos();
7000
7516
 
7001
7517
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7002
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7003
- cb(KQ_mask, "KQ_mask", -1);
7518
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7004
7519
 
7005
7520
  for (int il = 0; il < n_layer; ++il) {
7006
7521
  cur = llm_build_norm(ctx0, inpL, hparams,
@@ -7042,7 +7557,6 @@ struct llm_build_context {
7042
7557
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7043
7558
  model.layers[il].wo, model.layers[il].bo,
7044
7559
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7045
- cb(cur, "kqv_out", il);
7046
7560
  }
7047
7561
 
7048
7562
  // add the input
@@ -7094,16 +7608,13 @@ struct llm_build_context {
7094
7608
  struct ggml_tensor * cur;
7095
7609
  struct ggml_tensor * inpL;
7096
7610
 
7097
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
7098
- cb(inpL, "inp_embd", -1);
7611
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
7099
7612
 
7100
7613
  // inp_pos - contains the positions
7101
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
7102
- cb(inp_pos, "inp_pos", -1);
7614
+ struct ggml_tensor * inp_pos = build_inp_pos();
7103
7615
 
7104
7616
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7105
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7106
- cb(KQ_mask, "KQ_mask", -1);
7617
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7107
7618
 
7108
7619
  for (int il = 0; il < n_layer; ++il) {
7109
7620
  struct ggml_tensor * inpSA = inpL;
@@ -7155,7 +7666,6 @@ struct llm_build_context {
7155
7666
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7156
7667
  model.layers[il].wo, NULL,
7157
7668
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7158
- cb(cur, "kqv_out", il);
7159
7669
  }
7160
7670
 
7161
7671
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -7208,16 +7718,13 @@ struct llm_build_context {
7208
7718
  struct ggml_tensor * cur;
7209
7719
  struct ggml_tensor * inpL;
7210
7720
 
7211
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
7212
- cb(inpL, "inp_embd", -1);
7721
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
7213
7722
 
7214
7723
  // inp_pos - contains the positions
7215
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
7216
- cb(inp_pos, "inp_pos", -1);
7724
+ struct ggml_tensor * inp_pos = build_inp_pos();
7217
7725
 
7218
7726
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7219
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7220
- cb(KQ_mask, "KQ_mask", -1);
7727
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7221
7728
 
7222
7729
  for (int il = 0; il < n_layer; ++il) {
7223
7730
  struct ggml_tensor * inpSA = inpL;
@@ -7269,7 +7776,6 @@ struct llm_build_context {
7269
7776
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7270
7777
  model.layers[il].wo, model.layers[il].bo,
7271
7778
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7272
- cb(cur, "kqv_out", il);
7273
7779
  }
7274
7780
 
7275
7781
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -7331,20 +7837,17 @@ struct llm_build_context {
7331
7837
  struct ggml_tensor * cur;
7332
7838
  struct ggml_tensor * inpL;
7333
7839
 
7334
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
7335
- cb(inpL, "inp_embd", -1);
7840
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
7336
7841
 
7337
7842
  // scale the input embeddings
7338
7843
  inpL = ggml_scale(ctx0, inpL, scale_embd);
7339
7844
  cb(inpL, "inp_scaled", -1);
7340
7845
 
7341
7846
  // inp_pos - contains the positions
7342
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
7343
- cb(inp_pos, "inp_pos", -1);
7847
+ struct ggml_tensor * inp_pos = build_inp_pos();
7344
7848
 
7345
7849
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7346
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7347
- cb(KQ_mask, "KQ_mask", -1);
7850
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7348
7851
 
7349
7852
  for (int il = 0; il < n_layer; ++il) {
7350
7853
  struct ggml_tensor * inpSA = inpL;
@@ -7396,7 +7899,6 @@ struct llm_build_context {
7396
7899
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7397
7900
  model.layers[il].wo, model.layers[il].bo,
7398
7901
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7399
- cb(cur, "kqv_out", il);
7400
7902
  }
7401
7903
 
7402
7904
  // scale_res - scale the hidden states for residual connection
@@ -7463,22 +7965,18 @@ struct llm_build_context {
7463
7965
  struct ggml_tensor * cur;
7464
7966
  struct ggml_tensor * inpL;
7465
7967
 
7466
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
7467
- cb(inpL, "inp_embd", -1);
7968
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
7468
7969
 
7469
7970
  inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
7470
7971
  cb(inpL, "inp_scaled", -1);
7471
7972
 
7472
7973
  // inp_pos - contains the positions
7473
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
7474
- cb(inp_pos, "inp_pos", -1);
7974
+ struct ggml_tensor * inp_pos = build_inp_pos();
7475
7975
 
7476
7976
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7477
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7478
- cb(KQ_mask, "KQ_mask", -1);
7977
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7479
7978
 
7480
7979
  for (int il = 0; il < n_layer; ++il) {
7481
-
7482
7980
  // norm
7483
7981
  cur = llm_build_norm(ctx0, inpL, hparams,
7484
7982
  model.layers[il].attn_norm, NULL,
@@ -7515,7 +8013,6 @@ struct llm_build_context {
7515
8013
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7516
8014
  model.layers[il].wo, NULL,
7517
8015
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7518
- cb(cur, "kqv_out", il);
7519
8016
  }
7520
8017
 
7521
8018
  struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
@@ -7559,6 +8056,255 @@ struct llm_build_context {
7559
8056
 
7560
8057
  return gf;
7561
8058
  }
8059
+
8060
+ struct ggml_cgraph * build_starcoder2() {
8061
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8062
+
8063
+ const int64_t n_embd_head = hparams.n_embd_head_v;
8064
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8065
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
8066
+
8067
+ struct ggml_tensor * cur;
8068
+ struct ggml_tensor * inpL;
8069
+
8070
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
8071
+
8072
+ // inp_pos - contains the positions
8073
+ struct ggml_tensor * inp_pos = build_inp_pos();
8074
+
8075
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8076
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8077
+
8078
+ for (int il = 0; il < n_layer; ++il) {
8079
+ struct ggml_tensor * inpSA = inpL;
8080
+
8081
+ // norm
8082
+ cur = llm_build_norm(ctx0, inpL, hparams,
8083
+ model.layers[il].attn_norm, model.layers[il].attn_norm_b,
8084
+ LLM_NORM, cb, il);
8085
+ cb(cur, "attn_norm", il);
8086
+
8087
+ // self-attention
8088
+ {
8089
+ // compute Q and K and RoPE them
8090
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
8091
+ cb(Qcur, "Qcur", il);
8092
+ if (model.layers[il].bq) {
8093
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8094
+ cb(Qcur, "Qcur", il);
8095
+ }
8096
+
8097
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
8098
+ cb(Kcur, "Kcur", il);
8099
+ if (model.layers[il].bk) {
8100
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
8101
+ cb(Kcur, "Kcur", il);
8102
+ }
8103
+
8104
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
8105
+ cb(Vcur, "Vcur", il);
8106
+ if (model.layers[il].bv) {
8107
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8108
+ cb(Vcur, "Vcur", il);
8109
+ }
8110
+
8111
+ Qcur = ggml_rope_custom(
8112
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8113
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8114
+ ext_factor, attn_factor, beta_fast, beta_slow
8115
+ );
8116
+ cb(Qcur, "Qcur", il);
8117
+
8118
+ Kcur = ggml_rope_custom(
8119
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8120
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8121
+ ext_factor, attn_factor, beta_fast, beta_slow
8122
+ );
8123
+ cb(Kcur, "Kcur", il);
8124
+
8125
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8126
+ model.layers[il].wo, model.layers[il].bo,
8127
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8128
+ cb(cur, "kqv_out", il);
8129
+ }
8130
+
8131
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8132
+ cb(ffn_inp, "ffn_inp", il);
8133
+
8134
+ // feed-forward network
8135
+
8136
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
8137
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
8138
+ LLM_NORM, cb, il);
8139
+ cb(cur, "ffn_norm", il);
8140
+
8141
+ cur = llm_build_ffn(ctx0, cur,
8142
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
8143
+ NULL, NULL,
8144
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8145
+ NULL,
8146
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
8147
+ cb(cur, "ffn_out", il);
8148
+ cur = ggml_add(ctx0, cur, ffn_inp);
8149
+ cb(cur, "l_out", il);
8150
+
8151
+ // input for next layer
8152
+ inpL = cur;
8153
+ }
8154
+
8155
+ cur = inpL;
8156
+
8157
+ cur = llm_build_norm(ctx0, cur, hparams,
8158
+ model.output_norm, model.output_norm_b,
8159
+ LLM_NORM, cb, -1);
8160
+ cb(cur, "result_norm", -1);
8161
+
8162
+ // lm_head
8163
+ cur = ggml_mul_mat(ctx0, model.output, cur);
8164
+ cb(cur, "result_output", -1);
8165
+
8166
+ ggml_build_forward_expand(gf, cur);
8167
+
8168
+ return gf;
8169
+ }
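Editor's note: unlike the gated SwiGLU feed-forward used by the LLaMA-style graphs, the StarCoder2 block above is a plain sequential MLP (ffn_up, then GELU, then ffn_down, per the LLM_FFN_GELU / LLM_FFN_SEQ flags). A scalar sketch of the activation in that path, assuming the tanh-approximated GELU; the input values are made up for illustration:

    #include <cmath>
    #include <cstdio>

    // tanh approximation of GELU (assumed here; illustrative only)
    static float gelu(float x) {
        const float c = 0.7978845608f; // sqrt(2/pi)
        return 0.5f*x*(1.0f + std::tanh(c*(x + 0.044715f*x*x*x)));
    }

    int main() {
        // ffn_up projects to the hidden size, GELU is applied element-wise, ffn_down projects back;
        // shown here only for a handful of made-up pre-activation values
        const float up[4] = { -1.5f, -0.1f, 0.3f, 2.0f };
        for (int i = 0; i < 4; ++i) {
            printf("gelu(%+.2f) = %+.4f\n", up[i], gelu(up[i]));
        }
        return 0;
    }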
8170
+
8171
+ struct ggml_cgraph * build_mamba() {
8172
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8173
+
8174
+ const int64_t d_model = n_embd;
8175
+ const int64_t d_conv = hparams.ssm_d_conv;
8176
+ const int64_t d_inner = hparams.ssm_d_inner;
8177
+ GGML_ASSERT(2 * d_model == d_inner);
8178
+ const int64_t d_state = hparams.ssm_d_state;
8179
+ const int64_t dt_rank = hparams.ssm_dt_rank;
8180
+
8181
+ struct ggml_tensor * cur;
8182
+ struct ggml_tensor * inpL;
8183
+
8184
+ // {n_embd, n_tokens}
8185
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
8186
+
8187
+ struct ggml_tensor * state_mask = build_inp_s_mask();
8188
+ struct ggml_tensor * state_seq = build_inp_s_seq();
8189
+
8190
+ for (int il = 0; il < n_layer; ++il) {
8191
+ // (ab)using the KV cache to store the states
8192
+ struct ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s(), kv_self.size);
8193
+ struct ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], hparams.n_embd_v_s(), kv_self.size);
8194
+
8195
+ // clear states of sequences which are starting at the beginning of this batch
8196
+ {
8197
+ conv_states = ggml_mul(ctx0,
8198
+ ggml_view_2d(ctx0, conv_states, conv_states->ne[0], n_kv, conv_states->nb[1], kv_head*conv_states->nb[1]),
8199
+ state_mask);
8200
+ ssm_states = ggml_mul(ctx0,
8201
+ ggml_view_2d(ctx0, ssm_states, ssm_states->ne[0], n_kv, ssm_states->nb[1], kv_head*ssm_states->nb[1]),
8202
+ state_mask);
8203
+ }
8204
+
8205
+ conv_states = ggml_reshape_3d(ctx0, conv_states, d_conv - 1, d_inner, n_kv);
8206
+ ssm_states = ggml_reshape_3d(ctx0, ssm_states, d_state, d_inner, n_kv);
8207
+
8208
+ // norm
8209
+ cur = llm_build_norm(ctx0, inpL, hparams,
8210
+ model.layers[il].attn_norm, NULL,
8211
+ LLM_NORM_RMS, cb, il);
8212
+ cb(cur, "attn_norm", il);
8213
+
8214
+ // {n_embd, 2*d_inner} * {n_embd, n_tokens} => {2*d_inner, n_tokens}
8215
+ struct ggml_tensor * xz = ggml_mul_mat(ctx0, model.layers[il].ssm_in, cur);
8216
+ // split the above in two
8217
+ // => {d_inner, n_tokens}
8218
+ struct ggml_tensor * x = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], 0);
8219
+ struct ggml_tensor * z = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], ggml_element_size(xz)*d_inner);
8220
+
8221
+ // conv
8222
+ {
8223
+ // Custom operator which is needed only to ease simultaneous sequence processing.
8224
+ // For a single sequence, the equivalent is to concatenate the columns of conv_states and x,
8225
+ // then make a self-overlapping view of that over d_conv columns at each stride in the 3rd dimension,
8226
+ // then element-wise multiply that with the conv1d weight,
8227
+ // then sum the elements of each row,
8228
+ // (the last two steps are a dot product over rows (also doable with mul_mat))
8229
+ // then permute away the ne[0] dimension,
8230
+ // and then you're left with the resulting x tensor.
8231
+ // The new conv_states is the last (d_conv - 1) columns
8232
+ // of the last 3rd dimensional "layer" of the self-overlapping view.
8233
+ // For simultaneous sequences, it's more complicated.
8234
+ struct ggml_tensor * x_conv = ggml_ssm_conv(ctx0, conv_states, x, model.layers[il].ssm_conv1d, state_seq);
8235
+
8236
+ // store last (d_conv - 1) columns of the conv_state part of x_conv back into the KV cache
8237
+ ggml_build_forward_expand(gf,
8238
+ ggml_cpy(ctx0,
8239
+ ggml_view_2d(ctx0, x_conv, d_conv - 1, d_inner*n_kv, d_conv*ggml_element_size(x_conv), (1+d_inner*n_tokens)*ggml_element_size(x_conv)),
8240
+ ggml_view_1d(ctx0, kv_self.k_l[il], (d_conv - 1)*(d_inner)*(n_kv), kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(x_conv))));
8241
+
8242
+ // extract x from x_conv
8243
+ x = ggml_view_2d(ctx0, x_conv, d_inner, n_tokens, d_inner*ggml_element_size(x_conv), 0);
8244
+
8245
+ // bias
8246
+ x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b);
8247
+
8248
+ x = ggml_silu(ctx0, x);
8249
+ }
8250
+
8251
+ // ssm
8252
+ {
8253
+ // {d_inner, dt_rank + 2*d_state} * {d_inner, n_tokens} => {dt_rank + 2*d_state, n_tokens}
8254
+ struct ggml_tensor * x_db = ggml_mul_mat(ctx0, model.layers[il].ssm_x, x);
8255
+ // split
8256
+ struct ggml_tensor * dt = ggml_view_2d(ctx0, x_db, dt_rank, n_tokens, x_db->nb[1], 0);
8257
+ struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*dt_rank);
8258
+ struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state));
8259
+
8260
+ // {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens}
8261
+ dt = ggml_mul_mat(ctx0, model.layers[il].ssm_dt, dt);
8262
+ dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
8263
+
8264
+ // Custom operator to optimize the parallel associative scan
8265
+ // as described in the Annex D of the Mamba paper.
8266
+ // => {d_inner, n_tokens} and {d_state, d_inner, n_kv} combined,
8267
+ // because only a single tensor can be returned.
8268
+ struct ggml_tensor * y_ssm_states = ggml_ssm_scan(ctx0, ssm_states, x, dt, model.layers[il].ssm_a, B, C, state_seq);
8269
+
8270
+ // store last states (the second part of y_ssm_states)
8271
+ ggml_build_forward_expand(gf,
8272
+ ggml_cpy(ctx0,
8273
+ ggml_view_1d(ctx0, y_ssm_states, d_state*d_inner*n_kv, d_inner*n_tokens*ggml_element_size(y_ssm_states)),
8274
+ ggml_view_1d(ctx0, kv_self.v_l[il], d_state*d_inner*n_kv, kv_head*d_state*d_inner*ggml_element_size(ssm_states))));
8275
+
8276
+ struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0);
8277
+
8278
+ // {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens}
8279
+ y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
8280
+ y = ggml_mul(ctx0, y, ggml_silu(ctx0, z));
8281
+
8282
+ // {d_inner, n_embd} * {d_inner, n_tokens} => {n_embd, n_tokens}
8283
+ cur = ggml_mul_mat(ctx0, model.layers[il].ssm_out, y);
8284
+ }
8285
+
8286
+ // residual
8287
+ cur = ggml_add(ctx0, cur, inpL);
8288
+ cb(cur, "l_out", il);
8289
+
8290
+ // input for next layer
8291
+ inpL = cur;
8292
+ }
8293
+
8294
+ // final rmsnorm
8295
+ cur = llm_build_norm(ctx0, inpL, hparams,
8296
+ model.output_norm, NULL,
8297
+ LLM_NORM_RMS, cb, -1);
8298
+ cb(cur, "result_norm", -1);
8299
+
8300
+ // lm_head
8301
+ cur = ggml_mul_mat(ctx0, model.output, cur);
8302
+ cb(cur, "result_output", -1);
8303
+
8304
+ ggml_build_forward_expand(gf, cur);
8305
+
8306
+ return gf;
8307
+ }
7562
8308
  };
7563
8309
 
7564
8310
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -7595,6 +8341,23 @@ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
7595
8341
  return result;
7596
8342
  }
7597
8343
 
8344
+ static struct ggml_cgraph * llama_build_graph_s_copy(llama_context & lctx) {
8345
+ llama_batch dummy;
8346
+ dummy.n_tokens = 0;
8347
+
8348
+ llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
8349
+
8350
+ struct llm_build_context llm(lctx, dummy, cb, false);
8351
+
8352
+ llm.init();
8353
+
8354
+ struct ggml_cgraph * result = llm.build_s_copy();
8355
+
8356
+ llm.free();
8357
+
8358
+ return result;
8359
+ }
8360
+
7598
8361
  static struct ggml_cgraph * llama_build_graph(
7599
8362
  llama_context & lctx,
7600
8363
  const llama_batch & batch,
@@ -7612,7 +8375,18 @@ static struct ggml_cgraph * llama_build_graph(
7612
8375
  if (!lctx.cparams.offload_kqv) {
7613
8376
  if (strcmp(name, "kqv_merged_cont") == 0) {
7614
8377
  // all nodes between the KV store and the attention output are run on the CPU
7615
- ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
8378
+ ggml_backend_sched_set_tensor_backend(lctx.sched, cur, lctx.backend_cpu);
8379
+ }
8380
+ }
8381
+
8382
+ // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
8383
+ // to fix this, we assign the norm layer manually to the backend of its layer
8384
+ if (il != -1 && strcmp(name, "norm") == 0) {
8385
+ for (auto * backend : lctx.backends) {
8386
+ if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
8387
+ ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
8388
+ break;
8389
+ }
7616
8390
  }
7617
8391
  }
7618
8392
  };
@@ -7705,6 +8479,14 @@ static struct ggml_cgraph * llama_build_graph(
7705
8479
  {
7706
8480
  result = llm.build_gemma();
7707
8481
  } break;
8482
+ case LLM_ARCH_STARCODER2:
8483
+ {
8484
+ result = llm.build_starcoder2();
8485
+ } break;
8486
+ case LLM_ARCH_MAMBA:
8487
+ {
8488
+ result = llm.build_mamba();
8489
+ } break;
7708
8490
  default:
7709
8491
  GGML_ASSERT(false);
7710
8492
  }
@@ -7715,19 +8497,29 @@ static struct ggml_cgraph * llama_build_graph(
7715
8497
  }
7716
8498
 
7717
8499
  static void llama_set_k_shift(llama_context & lctx) {
7718
- const auto & cparams = lctx.cparams;
7719
-
7720
- const int64_t n_ctx = cparams.n_ctx;
8500
+ const int64_t kv_size = lctx.kv_self.size;
7721
8501
 
7722
8502
  assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
7723
8503
 
7724
8504
  int32_t * data = (int32_t *) lctx.inp_K_shift->data;
7725
8505
 
7726
- for (int i = 0; i < n_ctx; ++i) {
8506
+ for (int i = 0; i < kv_size; ++i) {
7727
8507
  data[i] = lctx.kv_self.cells[i].delta;
7728
8508
  }
7729
8509
  }
7730
8510
 
8511
+ static void llama_set_s_copy(llama_context & lctx) {
8512
+ const int64_t kv_size = lctx.kv_self.size;
8513
+
8514
+ assert(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer));
8515
+
8516
+ int32_t * data = (int32_t *) lctx.inp_s_copy->data;
8517
+
8518
+ for (int i = 0; i < kv_size; ++i) {
8519
+ data[i] = lctx.kv_self.cells[i].src;
8520
+ }
8521
+ }
8522
+
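Editor's note: llama_set_s_copy above writes each cache cell's .src index into inp_s_copy; the dedicated s_copy graph built by llama_build_graph_s_copy then uses those indices to move recurrent states between cells, for example after a sequence copy. Conceptually this is a row gather over the state buffers; a hypothetical standalone sketch of that idea, with one float standing in for a whole per-cell state:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        // toy state buffer: one value per cache cell
        std::vector<float>   state = { 10.0f, 20.0f, 30.0f, 40.0f };
        // src[i] is the cell whose state cell i should take over
        // (identity except cell 2, which was just made a copy of cell 0)
        std::vector<int32_t> src   = { 0, 1, 0, 3 };

        std::vector<float> new_state(state.size());
        for (size_t i = 0; i < state.size(); ++i) {
            new_state[i] = state[src[i]];        // gather from the source cell
        }
        for (size_t i = 0; i < new_state.size(); ++i) {
            printf("cell %zu: %.1f\n", i, new_state[i]);
        }
        return 0;
    }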
7731
8523
  static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7732
8524
  //
7733
8525
  // set input data
@@ -7750,34 +8542,74 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7750
8542
  ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
7751
8543
  }
7752
8544
 
7753
- if (batch.pos) {
8545
+ if (batch.pos && lctx.inp_pos) {
7754
8546
  const int64_t n_tokens = batch.n_tokens;
7755
8547
 
7756
8548
  ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
7757
8549
  }
7758
8550
 
7759
- {
7760
- const int64_t n_kv = kv_self.n;
7761
- const int64_t n_tokens = batch.n_tokens;
8551
+ GGML_ASSERT(
8552
+ (hparams.causal_attn || !cparams.causal_attn) &&
8553
+ "non-causal attention with generative models is not supported"
8554
+ );
7762
8555
 
7763
- assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
8556
+ if (lctx.inp_KQ_mask) {
8557
+ // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache.
8558
+ if (cparams.causal_attn) {
8559
+ const int64_t n_kv = kv_self.n;
8560
+ const int64_t n_tokens = batch.n_tokens;
7764
8561
 
7765
- float * data = (float *) lctx.inp_KQ_mask->data;
8562
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
7766
8563
 
7767
- for (int h = 0; h < 1; ++h) {
7768
- for (int j = 0; j < n_tokens; ++j) {
7769
- const llama_pos pos = batch.pos[j];
7770
- const llama_seq_id seq_id = batch.seq_id[j][0];
8564
+ float * data = (float *) lctx.inp_KQ_mask->data;
7771
8565
 
7772
- for (int i = 0; i < n_kv; ++i) {
7773
- float f;
7774
- if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
7775
- (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
7776
- f = -INFINITY;
7777
- } else {
7778
- f = 0;
8566
+ // For causal attention, use only the previous KV cells
8567
+ // of the correct sequence for each token of the batch.
8568
+ // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
8569
+ for (int h = 0; h < 1; ++h) {
8570
+ for (int j = 0; j < n_tokens; ++j) {
8571
+ const llama_pos pos = batch.pos[j];
8572
+ const llama_seq_id seq_id = batch.seq_id[j][0];
8573
+
8574
+ for (int i = 0; i < n_kv; ++i) {
8575
+ float f;
8576
+ if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
8577
+ f = -INFINITY;
8578
+ } else {
8579
+ f = 0.0f;
8580
+ }
8581
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
8582
+ }
8583
+ }
8584
+ }
8585
+ } else {
8586
+ // when using kv cache, the mask needs to match the kv cache size
8587
+ const int64_t n_tokens = batch.n_tokens;
8588
+ const int64_t n_stride = hparams.causal_attn ? kv_self.n : n_tokens;
8589
+
8590
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
8591
+
8592
+ float * data = (float *) lctx.inp_KQ_mask->data;
8593
+
8594
+ for (int h = 0; h < 1; ++h) {
8595
+ for (int j = 0; j < n_tokens; ++j) {
8596
+ const llama_seq_id seq_id = batch.seq_id[j][0];
8597
+
8598
+ for (int i = 0; i < n_tokens; ++i) {
8599
+ float f = -INFINITY;
8600
+ for (int s = 0; s < batch.n_seq_id[i]; ++s) {
8601
+ if (batch.seq_id[i][s] == seq_id) {
8602
+ f = 0.0f;
8603
+ break;
8604
+ }
8605
+ }
8606
+
8607
+ data[h*(n_tokens*n_tokens) + j*n_stride + i] = f;
8608
+ }
8609
+
8610
+ for (int i = n_tokens; i < n_stride; ++i) {
8611
+ data[h*(n_tokens*n_tokens) + j*n_stride + i] = -INFINITY;
7779
8612
  }
7780
- data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
7781
8613
  }
7782
8614
  }
7783
8615
  }
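Editor's note: the causal branch above writes, for every batch token j, a row of n_kv mask values: 0.0f for cache cells that belong to the same sequence and are not in the token's future, -INFINITY otherwise. A toy standalone version of that fill for a single sequence, just to make the resulting pattern visible:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        // toy cache: 4 cells holding positions 0..3 of one sequence;
        // the batch holds 2 tokens at positions 2 and 3
        const int n_kv     = 4;
        const int n_tokens = 2;
        const int cell_pos[n_kv]      = { 0, 1, 2, 3 };
        const int batch_pos[n_tokens] = { 2, 3 };

        std::vector<float> mask(n_kv*n_tokens);
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                // future cells are masked out; everything else adds 0 to the attention score
                mask[j*n_kv + i] = cell_pos[i] > batch_pos[j] ? -INFINITY : 0.0f;
            }
        }
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                printf("%6.1f ", mask[j*n_kv + i]);
            }
            printf("\n");
        }
        return 0;
    }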
@@ -7786,7 +8618,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7786
8618
  if (hparams.need_kq_pos) {
7787
8619
  const int64_t n_kv = kv_self.n;
7788
8620
 
7789
- assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
8621
+ GGML_ASSERT(lctx.inp_KQ_pos);
8622
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
7790
8623
 
7791
8624
  float * data = (float *) lctx.inp_KQ_pos->data;
7792
8625
 
@@ -7795,17 +8628,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7795
8628
  }
7796
8629
  }
7797
8630
 
7798
- if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
8631
+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
7799
8632
  const int64_t n_tokens = batch.n_tokens;
7800
8633
 
8634
+ GGML_ASSERT(lctx.inp_mean);
7801
8635
  GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
7802
- float * data = (float *) lctx.inp_mean->data;
7803
8636
 
8637
+ float * data = (float *) lctx.inp_mean->data;
7804
8638
  memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
7805
8639
 
7806
8640
  std::vector<uint64_t> sum(n_tokens, 0);
7807
8641
  for (int i = 0; i < n_tokens; ++i) {
7808
8642
  const llama_seq_id seq_id = batch.seq_id[i][0];
8643
+
8644
+ GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
8645
+
7809
8646
  sum[seq_id] += 1;
7810
8647
  }
7811
8648
 
@@ -7823,20 +8660,73 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7823
8660
  }
7824
8661
  }
7825
8662
 
7826
- if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
8663
+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
7827
8664
  const int64_t n_tokens = batch.n_tokens;
7828
8665
 
8666
+ GGML_ASSERT(lctx.inp_cls);
7829
8667
  GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
8668
+
7830
8669
  uint32_t * data = (uint32_t *) lctx.inp_cls->data;
8670
+ memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
7831
8671
 
7832
8672
  for (int i = 0; i < n_tokens; ++i) {
7833
8673
  const llama_seq_id seq_id = batch.seq_id[i][0];
7834
- const llama_pos pos = batch.pos[i];
8674
+ const llama_pos pos = batch.pos[i];
8675
+
8676
+ GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
8677
+
7835
8678
  if (pos == 0) {
7836
8679
  data[seq_id] = i;
7837
8680
  }
7838
8681
  }
7839
8682
  }
8683
+
8684
+ if (kv_self.recurrent) {
8685
+ const int64_t n_kv = kv_self.n;
8686
+
8687
+ if (lctx.inp_s_mask) {
8688
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer));
8689
+ float * data = (float *) lctx.inp_s_mask->data;
8690
+
8691
+ // states which are not affected by the current batch are left untouched
8692
+ for (int i = 0; i < n_kv; ++i) {
8693
+ llama_seq_id seq_id = i + lctx.kv_self.head;
8694
+ llama_kv_cell & kv_cell = lctx.kv_self.cells[seq_id];
8695
+ bool has_self_seq = kv_cell.has_seq_id(seq_id);
8696
+
8697
+ data[i] = (float) has_self_seq;
8698
+
8699
+ // ensure current sequences will be kept
8700
+ if (!has_self_seq && kv_cell.pos >= 0) {
8701
+ kv_cell.seq_id.insert(seq_id);
8702
+ }
8703
+ }
8704
+ }
8705
+ // For Mamba (and other recurrent architectures),
8706
+ // update the correct state(s)/sequence(s) for each token of the batch.
8707
+ // Like with the KQ_mask, if a token in the batch has multiple sequences,
8708
+ // they are assumed to be equivalent (not here, but in ggml_ssm_scan and ggml_ssm_conv).
8709
+ if (lctx.inp_s_seq) {
8710
+ const int64_t n_tokens = batch.n_tokens;
8711
+
8712
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_seq->buffer));
8713
+ int32_t * data = (int32_t *) lctx.inp_s_seq->data;
8714
+
8715
+ for (int j = 0; j < n_tokens; ++j) {
8716
+ const int32_t n_seq = batch.n_seq_id[j];
8717
+ GGML_ASSERT(0 < n_seq); // a token should be part of at least 1 sequence
8718
+
8719
+ for (int i = 0; i < n_kv; ++i) {
8720
+ if (i < n_seq) {
8721
+ // for this type of model, the head is the minimum seq_id of the batch
8722
+ data[j*n_kv + i] = batch.seq_id[j][i] - kv_self.head;
8723
+ } else {
8724
+ data[j*n_kv + i] = -1;
8725
+ }
8726
+ }
8727
+ }
8728
+ }
8729
+ }
7840
8730
  }
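Editor's note: for recurrent models, the inp_s_seq tensor filled above is an n_kv x n_tokens matrix: for batch token j, the first n_seq_id[j] entries of column j hold that token's sequence ids expressed relative to kv_self.head (the minimum sequence id of the batch, per the comment above), and the remaining entries are padded with -1. A toy fill showing that layout with assumed values:

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_kv     = 3;
        const int n_tokens = 2;
        const int kv_head  = 5;

        // token 0 belongs to sequences {5, 6}, token 1 only to {5} (absolute sequence ids)
        const std::vector<std::vector<int>> seq_ids = { { 5, 6 }, { 5 } };

        std::vector<int> s_seq(n_kv*n_tokens, -1);
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                s_seq[j*n_kv + i] = i < (int) seq_ids[j].size()
                                  ? seq_ids[j][i] - kv_head   // sequence id relative to the cache head
                                  : -1;                       // padding for unused slots
            }
        }
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                printf("%3d ", s_seq[j*n_kv + i]);
            }
            printf("\n");
        }
        return 0;
    }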
7841
8731
 
7842
8732
  static void llama_graph_compute(
@@ -7856,9 +8746,10 @@ static void llama_graph_compute(
7856
8746
 
7857
8747
  if (lctx.backend_cpu != nullptr) {
7858
8748
  ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
8749
+ ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
7859
8750
  }
7860
8751
 
7861
- ggml_backend_sched_graph_compute(lctx.sched, gf);
8752
+ ggml_backend_sched_graph_compute_async(lctx.sched, gf);
7862
8753
 
7863
8754
  // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
7864
8755
 
@@ -7878,10 +8769,11 @@ static void llama_graph_compute(
7878
8769
  //
7879
8770
  static int llama_decode_internal(
7880
8771
  llama_context & lctx,
7881
- llama_batch batch) {
7882
- const uint32_t n_tokens = batch.n_tokens;
8772
+ llama_batch batch_all) { // TODO: rename back to batch
8773
+
8774
+ const uint32_t n_tokens_all = batch_all.n_tokens;
7883
8775
 
7884
- if (n_tokens == 0) {
8776
+ if (n_tokens_all == 0) {
7885
8777
  LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
7886
8778
  return -1;
7887
8779
  }
@@ -7890,14 +8782,16 @@ static int llama_decode_internal(
7890
8782
  const auto & hparams = model.hparams;
7891
8783
  const auto & cparams = lctx.cparams;
7892
8784
 
7893
- const auto n_batch = cparams.n_batch;
8785
+ GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
7894
8786
 
7895
- GGML_ASSERT(n_tokens <= n_batch);
7896
- GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
8787
+ GGML_ASSERT(n_tokens_all <= cparams.n_batch);
7897
8788
 
7898
- int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
8789
+ GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
7899
8790
 
7900
- const int64_t t_start_us = ggml_time_us();
8791
+ if (lctx.t_compute_start_us == 0) {
8792
+ lctx.t_compute_start_us = ggml_time_us();
8793
+ }
8794
+ lctx.n_queued_tokens += n_tokens_all;
7901
8795
 
7902
8796
  #ifdef GGML_USE_MPI
7903
8797
  // TODO: needs fix after #3228
@@ -7905,213 +8799,274 @@ static int llama_decode_internal(
7905
8799
  //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
7906
8800
  #endif
7907
8801
 
7908
- GGML_ASSERT(n_threads > 0);
7909
-
7910
8802
  auto & kv_self = lctx.kv_self;
7911
8803
 
7912
8804
  const int64_t n_embd = hparams.n_embd;
7913
8805
  const int64_t n_vocab = hparams.n_vocab;
7914
8806
 
7915
- // helpers for smoother batch API transition
7916
- // after deprecating the llama_eval calls, these will be removed
7917
- std::vector<llama_pos> pos;
7918
8807
 
7919
- std::vector<int32_t> n_seq_id;
7920
- std::vector<llama_seq_id *> seq_id_arr;
7921
- std::vector<std::vector<llama_seq_id>> seq_id;
8808
+ auto * logits_out = lctx.logits;
7922
8809
 
7923
- if (batch.pos == nullptr) {
7924
- pos.resize(n_tokens);
7925
- for (uint32_t i = 0; i < n_tokens; i++) {
7926
- pos[i] = batch.all_pos_0 + i*batch.all_pos_1;
7927
- }
8810
+ #ifndef NDEBUG
8811
+ auto & logits_valid = lctx.logits_valid;
8812
+ logits_valid.clear();
8813
+ logits_valid.resize(n_tokens_all);
7928
8814
 
7929
- batch.pos = pos.data();
7930
- }
8815
+ memset(logits_out, 0, lctx.logits_size*sizeof(float));
8816
+ #endif
7931
8817
 
7932
- if (batch.seq_id == nullptr) {
7933
- n_seq_id.resize(n_tokens);
7934
- seq_id.resize(n_tokens);
7935
- seq_id_arr.resize(n_tokens);
7936
- for (uint32_t i = 0; i < n_tokens; i++) {
7937
- n_seq_id[i] = 1;
7938
- seq_id[i].resize(1);
7939
- seq_id[i][0] = batch.all_seq_id;
7940
- seq_id_arr[i] = seq_id[i].data();
7941
- }
8818
+ const auto n_ubatch = cparams.n_ubatch;
7942
8819
 
7943
- batch.n_seq_id = n_seq_id.data();
7944
- batch.seq_id = seq_id_arr.data();
7945
- }
8820
+ std::vector<llama_pos> pos;
8821
+ std::vector<int32_t> n_seq_id;
8822
+ std::vector<llama_seq_id *> seq_id_arr;
8823
+ std::vector<std::vector<llama_seq_id>> seq_id;
7946
8824
 
7947
- llama_kv_cache_update(&lctx);
8825
+ for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
8826
+ const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
8827
+ llama_batch u_batch = {
8828
+ /* .n_tokens = */ (int32_t) n_tokens,
8829
+ /* .token = */ batch_all.token ? batch_all.token + cur_token : nullptr,
8830
+ /* .embd = */ batch_all.embd ? batch_all.embd + cur_token*n_embd : nullptr,
8831
+ /* .pos = */ batch_all.pos ? batch_all.pos + cur_token : nullptr,
8832
+ /* .n_seq_id = */ batch_all.n_seq_id ? batch_all.n_seq_id + cur_token : nullptr,
8833
+ /* .seq_id = */ batch_all.seq_id ? batch_all.seq_id + cur_token : nullptr,
8834
+ /* .logits = */ batch_all.logits ? batch_all.logits + cur_token : nullptr,
8835
+ /* .all_pos_0 = */ batch_all.all_pos_0 + (llama_pos) cur_token*batch_all.all_pos_1,
8836
+ /* .all_pos_1 = */ batch_all.all_pos_1,
8837
+ /* .all_seq_id = */ batch_all.all_seq_id,
8838
+ };
7948
8839
 
7949
- // if we have enough unused cells before the current head ->
7950
- // better to start searching from the beginning of the cache, hoping to fill it
7951
- if (kv_self.head > kv_self.used + 2*n_tokens) {
7952
- kv_self.head = 0;
7953
- }
8840
+ int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
8841
+ GGML_ASSERT(n_threads > 0);
7954
8842
 
7955
- if (!llama_kv_cache_find_slot(kv_self, batch)) {
7956
- return 1;
7957
- }
8843
+ // helpers for smoother batch API transition
8844
+ // after deprecating the llama_eval calls, these will be removed
8845
+ if (u_batch.pos == nullptr) {
8846
+ pos.resize(n_tokens);
8847
+ for (uint32_t i = 0; i < n_tokens; i++) {
8848
+ pos[i] = u_batch.all_pos_0 + i*u_batch.all_pos_1;
8849
+ }
7958
8850
 
7959
- // a heuristic, to avoid attending the full cache if it is not yet utilized
7960
- // after enough generations, the benefit from this heuristic disappears
7961
- // if we start defragmenting the cache, the benefit from this will be more important
7962
- kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
7963
- //kv_self.n = llama_kv_cache_cell_max(kv_self);
8851
+ u_batch.pos = pos.data();
8852
+ }
7964
8853
 
7965
- //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
8854
+ if (u_batch.seq_id == nullptr) {
8855
+ n_seq_id.resize(n_tokens);
8856
+ seq_id.resize(n_tokens);
8857
+ seq_id_arr.resize(n_tokens);
8858
+ for (uint32_t i = 0; i < n_tokens; i++) {
8859
+ n_seq_id[i] = 1;
8860
+ seq_id[i].resize(1);
8861
+ seq_id[i][0] = u_batch.all_seq_id;
8862
+ seq_id_arr[i] = seq_id[i].data();
8863
+ }
7966
8864
 
7967
- ggml_backend_sched_reset(lctx.sched);
7968
- ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
8865
+ u_batch.n_seq_id = n_seq_id.data();
8866
+ u_batch.seq_id = seq_id_arr.data();
8867
+ }
7969
8868
 
7970
- ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
8869
+ // non-causal masks do not use the KV cache
8870
+ if (hparams.causal_attn) {
8871
+ llama_kv_cache_update(&lctx);
7971
8872
 
7972
- // the output is always the last tensor in the graph
7973
- struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
7974
- struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
8873
+ // if we have enough unused cells before the current head ->
8874
+ // better to start searching from the beginning of the cache, hoping to fill it
8875
+ if (kv_self.head > kv_self.used + 2*n_tokens) {
8876
+ kv_self.head = 0;
8877
+ }
8878
+
8879
+ if (!llama_kv_cache_find_slot(kv_self, u_batch)) {
8880
+ return 1;
8881
+ }
7975
8882
 
7976
- if (strcmp(res->name, "result_output") == 0) {
7977
- // the embeddings could be the second to last tensor, or the third to last tensor
7978
- if (strcmp(embeddings->name, "result_norm") != 0) {
7979
- embeddings = gf->nodes[gf->n_nodes - 3];
7980
- GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
8883
+ if (!kv_self.recurrent) {
8884
+ // a heuristic, to avoid attending the full cache if it is not yet utilized
8885
+ // after enough generations, the benefit from this heuristic disappears
8886
+ // if we start defragmenting the cache, the benefit from this will be more important
8887
+ kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
8888
+ //kv_self.n = llama_kv_cache_cell_max(kv_self);
8889
+ }
7981
8890
  }
7982
- } else if (strcmp(res->name, "result_embd") == 0) {
7983
- embeddings = res;
7984
- res = nullptr;
7985
- } else {
7986
- GGML_ASSERT(false);
7987
- }
7988
8891
 
7989
- // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
8892
+ //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
7990
8893
 
7991
- // for big prompts, if BLAS is enabled, it is better to use only one thread
7992
- // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
7993
- // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
7994
- // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
7995
- // with the BLAS calls. need a better solution
7996
- // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
7997
- // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
7998
- if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
7999
- n_threads = std::min(4, n_threads);
8000
- }
8894
+ ggml_backend_sched_reset(lctx.sched);
8895
+ ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
8001
8896
 
8002
- llama_set_inputs(lctx, batch);
8897
+ ggml_cgraph * gf = llama_build_graph(lctx, u_batch, false);
8003
8898
 
8004
- llama_graph_compute(lctx, gf, n_threads);
8899
+ // the output is always the last tensor in the graph
8900
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
8901
+ struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
8005
8902
 
8006
- // update the kv ring buffer
8007
- {
8008
- kv_self.head += n_tokens;
8903
+ if (!hparams.causal_attn) {
8904
+ res = nullptr; // do not extract logits for embedding models such as BERT
8009
8905
 
8010
- // Ensure kv cache head points to a valid index.
8011
- if (kv_self.head >= kv_self.size) {
8012
- kv_self.head = 0;
8013
- }
8014
- }
8015
-
8016
- // decide if we need to defrag the kv cache
8017
- if (cparams.defrag_thold >= 0.0f) {
8018
- const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens)/float(kv_self.n) : 0.0f;
8906
+ // token or sequence embeddings
8907
+ embd = gf->nodes[gf->n_nodes - 1];
8019
8908
 
8020
- // queue defragmentation for next llama_kv_cache_update
8021
- if (fragmentation > cparams.defrag_thold) {
8022
- //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
8909
+ GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
8910
+ } else {
8911
+ if (strcmp(res->name, "result_output") == 0) {
8912
+ // the token embeddings could be the second to last tensor, or the third to last tensor
8913
+ if (strcmp(embd->name, "result_norm") != 0) {
8914
+ embd = gf->nodes[gf->n_nodes - 3];
8915
+ GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
8916
+ }
8917
+ } else {
8918
+ GGML_ASSERT(false && "missing result_output tensor");
8919
+ }
8920
+ }
8921
+ // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
8023
8922
 
8024
- llama_kv_cache_defrag(kv_self);
8923
+ // for big prompts, if BLAS is enabled, it is better to use only one thread
8924
+ // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
8925
+ // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
8926
+ // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
8927
+ // with the BLAS calls. need a better solution
8928
+ // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
8929
+ // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
8930
+ if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
8931
+ n_threads = std::min(4, n_threads);
8025
8932
  }
8026
- }
8027
8933
 
8028
- #ifdef GGML_PERF
8029
- // print timing information per ggml operation (for debugging purposes)
8030
- // requires GGML_PERF to be defined
8031
- ggml_graph_print(gf);
8032
- #endif
8934
+ ggml_backend_sched_alloc_graph(lctx.sched, gf);
8033
8935
 
8034
- // plot the computation graph in dot format (for debugging purposes)
8035
- //if (n_past%100 == 0) {
8036
- // ggml_graph_dump_dot(gf, NULL, "llama.dot");
8037
- //}
8936
+ llama_set_inputs(lctx, u_batch);
8038
8937
 
8039
- // extract logits
8040
- // TODO: do not compute and extract logits if only embeddings are needed
8041
- // need to update the graphs to skip "result_output"
8042
- if (res) {
8043
- auto & logits_out = lctx.logits;
8938
+ llama_graph_compute(lctx, gf, n_threads);
8044
8939
 
8045
- #ifndef NDEBUG
8046
- auto & logits_valid = lctx.logits_valid;
8047
- logits_valid.clear();
8048
- logits_valid.resize(n_tokens);
8940
+ // update the kv ring buffer
8941
+ {
8942
+ kv_self.head += n_tokens;
8943
+
8944
+ // Ensure kv cache head points to a valid index.
8945
+ if (kv_self.head >= kv_self.size) {
8946
+ kv_self.head = 0;
8947
+ }
8948
+ }
8049
8949
 
8050
- logits_out.clear();
8950
+ #ifdef GGML_PERF
8951
+ // print timing information per ggml operation (for debugging purposes)
8952
+ // requires GGML_PERF to be defined
8953
+ ggml_graph_print(gf);
8051
8954
  #endif
8052
8955
 
8053
- ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
8054
- GGML_ASSERT(res_backend != nullptr);
8055
- if (batch.logits) {
8056
- logits_out.resize(n_vocab * n_tokens);
8057
- for (uint32_t i = 0; i < n_tokens; i++) {
8058
- if (batch.logits[i] == 0) {
8059
- continue;
8060
- }
8061
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
8956
+ // plot the computation graph in dot format (for debugging purposes)
8957
+ //if (n_past%100 == 0) {
8958
+ // ggml_graph_dump_dot(gf, NULL, "llama.dot");
8959
+ //}
8960
+
8961
+ // extract logits
8962
+ // TODO: do not compute and extract logits if only embeddings are needed
8963
+ // update the graphs to skip "result_output" if logits are not needed
8964
+ if (res) {
8965
+ ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
8966
+ GGML_ASSERT(backend_res != nullptr);
8967
+ if (u_batch.logits) {
8968
+ int32_t i_first = -1;
8969
+ for (uint32_t i = 0; i < n_tokens; i++) {
8970
+ if (u_batch.logits[i] && i_first == -1) {
8971
+ i_first = (int32_t) i;
8972
+ }
8973
+ if (u_batch.logits[i] == 0 || i == n_tokens - 1) {
8974
+ if (i_first != -1) {
8975
+ int i_last = u_batch.logits[i] == 0 ? i : i + 1;
8976
+ // extract logits for the range [i_first, i_last)
8977
+ // group the requests to minimize the number of calls to the backend
8978
+ ggml_backend_tensor_get_async(backend_res, res,
8979
+ logits_out + n_vocab*(cur_token + i_first),
8980
+ i_first*n_vocab*sizeof(float),
8981
+ (i_last - i_first)*n_vocab*sizeof(float));
8982
+ i_first = -1;
8983
+ }
8984
+ }
8062
8985
  #ifndef NDEBUG
8063
- logits_valid[i] = true;
8986
+ logits_valid[cur_token + i] = u_batch.logits[i] != 0;

8064
8987
  #endif
8065
- }
8066
- } else if (lctx.logits_all) {
8067
- logits_out.resize(n_vocab * n_tokens);
8068
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
8988
+ }
8989
+ } else if (lctx.logits_all) {
8990
+ ggml_backend_tensor_get_async(backend_res, res, logits_out + n_vocab*cur_token, 0, n_vocab*n_tokens*sizeof(float));
8069
8991
  #ifndef NDEBUG
8070
- std::fill(logits_valid.begin(), logits_valid.end(), true);
8992
+ std::fill(logits_valid.begin() + cur_token, logits_valid.begin() + cur_token + n_tokens, true);
8071
8993
  #endif
8072
- } else {
8073
- logits_out.resize(n_vocab);
8074
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
8994
+ } else {
8995
+ if (cur_token + n_tokens >= n_tokens_all) {
8996
+ ggml_backend_tensor_get_async(backend_res, res, logits_out, n_vocab*(n_tokens - 1)*sizeof(float), n_vocab*sizeof(float));
8075
8997
  #ifndef NDEBUG
8076
- logits_valid[0] = true;
8998
+ logits_valid[0] = true;
8077
8999
  #endif
9000
+ }
9001
+ }
8078
9002
  }
8079
- ggml_backend_synchronize(res_backend);
8080
- }
8081
9003
 
8082
- // extract embeddings
8083
- if (!lctx.embedding.empty()) {
8084
- auto & embedding_out = lctx.embedding;
9004
+ // extract embeddings
9005
+ if (cparams.embeddings && embd) {
9006
+ ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
9007
+ GGML_ASSERT(backend_embd != nullptr);
8085
9008
 
8086
- const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
8087
- const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
9009
+ switch (cparams.pooling_type) {
9010
+ case LLAMA_POOLING_TYPE_NONE:
9011
+ {
9012
+ // extract token embeddings
9013
+ auto & embd_out = lctx.embd;
9014
+
9015
+ if (u_batch.logits) {
9016
+ //embd_out.resize(n_embd * n_tokens);
9017
+ for (uint32_t i = 0; i < n_tokens; i++) {
9018
+ if (u_batch.logits[i] == 0) {
9019
+ continue;
9020
+ }
9021
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_out + n_embd*(i + cur_token), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
9022
+ }
9023
+ }
9024
+ } break;
9025
+ case LLAMA_POOLING_TYPE_CLS:
9026
+ case LLAMA_POOLING_TYPE_MEAN:
9027
+ {
9028
+ GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
8088
9029
 
8089
- embedding_out.resize(embd_size);
8090
- ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
8091
- ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
8092
- ggml_backend_synchronize(embeddings_backend);
8093
- }
9030
+ // extract sequence embeddings
9031
+ auto & embd_seq_out = lctx.embd_seq;
9032
+ embd_seq_out.clear();
8094
9033
 
8095
- // measure the performance only for the single-token evals
8096
- if (n_tokens == 1) {
8097
- lctx.t_eval_us += ggml_time_us() - t_start_us;
8098
- lctx.n_eval++;
8099
- }
8100
- else if (n_tokens > 1) {
8101
- lctx.t_p_eval_us += ggml_time_us() - t_start_us;
8102
- lctx.n_p_eval += n_tokens;
9034
+ for (uint32_t i = 0; i < n_tokens; i++) {
9035
+ const llama_seq_id seq_id = u_batch.seq_id[i][0];
9036
+ if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
9037
+ continue;
9038
+ }
9039
+ embd_seq_out[seq_id].resize(n_embd);
9040
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
9041
+ }
9042
+ } break;
9043
+ case LLAMA_POOLING_TYPE_UNSPECIFIED:
9044
+ {
9045
+ GGML_ASSERT(false && "unknown pooling type");
9046
+ } break;
9047
+ }
9048
+ }
8103
9049
  }
8104
9050
 
8105
- // get a more accurate load time, upon first eval
8106
- // TODO: fix this
8107
- if (!lctx.has_evaluated_once) {
8108
- lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
8109
- lctx.has_evaluated_once = true;
9051
+ // wait for the computation to finish (automatically done when obtaining the model output)
9052
+ //llama_synchronize(&lctx);
9053
+
9054
+ // decide if we need to defrag the kv cache
9055
+ if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
9056
+ const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
9057
+
9058
+ // queue defragmentation for next llama_kv_cache_update
9059
+ if (fragmentation > cparams.defrag_thold) {
9060
+ //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
9061
+
9062
+ llama_kv_cache_defrag(kv_self);
9063
+ }
8110
9064
  }
8111
9065
 
8112
9066
  return 0;
8113
9067
  }
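The logits extraction above groups consecutive tokens whose logits were requested into contiguous ranges, so each range costs a single backend transfer. A small standalone sketch of the same grouping logic, with made-up per-token flags:

#include <cstdio>
#include <vector>

int main() {
    const std::vector<int8_t> logits = {0, 1, 1, 1, 0, 1, 0, 0, 1, 1}; // assumed per-token flags
    const int n_tokens = (int) logits.size();

    int i_first = -1;
    for (int i = 0; i < n_tokens; i++) {
        if (logits[i] && i_first == -1) {
            i_first = i;
        }
        if (logits[i] == 0 || i == n_tokens - 1) {
            if (i_first != -1) {
                const int i_last = logits[i] == 0 ? i : i + 1;
                // one ggml_backend_tensor_get_async call would cover rows [i_first, i_last)
                printf("fetch rows [%d, %d)\n", i_first, i_last);
                i_first = -1;
            }
        }
    }
    return 0; // prints: fetch rows [1, 4) / [5, 6) / [8, 10)
}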
8114
9068
 
9069
+
8115
9070
  // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
8116
9071
  static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
8117
9072
  auto & kv_self = lctx.kv_self;
@@ -8130,6 +9085,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
8130
9085
  // number of cells moved
8131
9086
  uint32_t n_moves = 0;
8132
9087
 
9088
+ // each move requires 6*n_layer tensors (see build_defrag)
9089
+ // - source view, destination view, copy operation
9090
+ // - x2 for keys and values
9091
+ const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
9092
+
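For a rough sense of scale, taking LLAMA_MAX_NODES = 8192 and an assumed n_layer = 32 as example values, max_moves = 8192/(6*32) = 42, so a single defrag graph can schedule at most 42 hole-filling moves; cells that cannot be moved within that budget are simply left in place for this pass.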
8133
9093
  // determine which KV cells to move where
8134
9094
  //
8135
9095
  // cell i moves to ids[i]
@@ -8156,15 +9116,6 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
8156
9116
  nh++;
8157
9117
  }
8158
9118
 
8159
- // each move requires 6*n_layer tensors (see build_defrag)
8160
- // - source view, destination view, copy operation
8161
- // - x2 for keys and values
8162
- //
8163
- if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
8164
- // the graph is too big, we cannot move more cells
8165
- break;
8166
- }
8167
-
8168
9119
  uint32_t nf = 0;
8169
9120
  uint32_t is = n_kv - 1;
8170
9121
 
@@ -8194,11 +9145,19 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
8194
9145
  // are we moving a continuous block of memory?
8195
9146
  bool cont = false;
8196
9147
 
9148
+ // should we stop searching for the next move?
9149
+ bool stop = false;
9150
+
8197
9151
  // go back and move the nf cells to the hole
8198
9152
  for (; i1 < n_kv; ++i1) {
8199
9153
  auto & cell1 = kv_self.cells[i1];
8200
9154
 
8201
9155
  if (cell1.is_empty() || ids[i1] != n_kv) {
9156
+ if (n_moves == max_moves) {
9157
+ stop = true;
9158
+ break;
9159
+ }
9160
+
8202
9161
  cont = false;
8203
9162
  continue;
8204
9163
  }
@@ -8225,6 +9184,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
8225
9184
  }
8226
9185
  }
8227
9186
 
9187
+ if (stop || n_moves == max_moves) {
9188
+ break;
9189
+ }
9190
+
8228
9191
  //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
8229
9192
 
8230
9193
  i0 += nh - 1;
@@ -8311,6 +9274,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
8311
9274
  #else
8312
9275
  // ggml_graph defrag
8313
9276
 
9277
+ ggml_backend_sched_reset(lctx.sched);
9278
+
8314
9279
  ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
8315
9280
 
8316
9281
  llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
@@ -8322,14 +9287,22 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
8322
9287
  }
8323
9288
 
8324
9289
  static void llama_kv_cache_update_internal(struct llama_context & lctx) {
9290
+ bool need_reserve = false;
9291
+
8325
9292
  // apply K-shift if needed
8326
9293
  if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
8327
- llama_set_k_shift(lctx);
8328
-
8329
9294
  {
9295
+ ggml_backend_sched_reset(lctx.sched);
9296
+
8330
9297
  ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
8331
9298
 
9299
+ ggml_backend_sched_alloc_graph(lctx.sched, gf);
9300
+
9301
+ llama_set_k_shift(lctx);
9302
+
8332
9303
  llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
9304
+
9305
+ need_reserve = true;
8333
9306
  }
8334
9307
 
8335
9308
  {
@@ -8343,12 +9316,56 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
8343
9316
  }
8344
9317
  }
8345
9318
 
9319
+ if (lctx.kv_self.recurrent && lctx.kv_self.do_copy) {
9320
+ {
9321
+ ggml_backend_sched_reset(lctx.sched);
9322
+
9323
+ ggml_cgraph * gf = llama_build_graph_s_copy(lctx);
9324
+
9325
+ ggml_backend_sched_alloc_graph(lctx.sched, gf);
9326
+
9327
+ llama_set_s_copy(lctx);
9328
+
9329
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
9330
+
9331
+ need_reserve = true;
9332
+ }
9333
+
9334
+ {
9335
+ auto & kv_self = lctx.kv_self;
9336
+
9337
+ kv_self.do_copy = false;
9338
+
9339
+ for (uint32_t i = 0; i < kv_self.size; ++i) {
9340
+ kv_self.cells[i].src = i;
9341
+ }
9342
+ }
9343
+ }
9344
+
8346
9345
  // defragment the KV cache if needed
8347
9346
  if (lctx.kv_self.do_defrag) {
8348
9347
  llama_kv_cache_defrag_internal(lctx);
8349
9348
 
9349
+ need_reserve = true;
9350
+
8350
9351
  lctx.kv_self.do_defrag = false;
8351
9352
  }
9353
+
9354
+ // reserve a worst case graph again
9355
+ if (need_reserve) {
9356
+ // TODO: extract to a function
9357
+ // build worst-case graph
9358
+ int n_tokens = (int)std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch);
9359
+ int n_past = lctx.cparams.n_ctx - n_tokens;
9360
+ llama_token token = llama_token_bos(&lctx.model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
9361
+ ggml_cgraph * gf = llama_build_graph(lctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
9362
+
9363
+ // initialize scheduler with the worst-case graph
9364
+ ggml_backend_sched_reset(lctx.sched);
9365
+ if (!ggml_backend_sched_reserve(lctx.sched, gf)) {
9366
+ LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
9367
+ }
9368
+ }
8352
9369
  }
8353
9370
 
8354
9371
  //
@@ -8360,46 +9377,53 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
8360
9377
  }
8361
9378
 
8362
9379
  static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
9380
+ GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
8363
9381
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
8364
9382
  }
8365
9383
 
8366
9384
  static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
9385
+ GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
8367
9386
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
8368
9387
  }
8369
9388
 
8370
9389
  static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
9390
+ GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
8371
9391
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
8372
9392
  }
8373
9393
 
8374
9394
  static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
9395
+ GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
8375
9396
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
8376
9397
  }
8377
9398
 
8378
9399
  static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
9400
+ GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
8379
9401
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
8380
9402
  }
8381
9403
 
8382
9404
  static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
9405
+ GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
8383
9406
  GGML_ASSERT(llama_is_byte_token(vocab, id));
8384
9407
  const auto& token_data = vocab.id_to_token.at(id);
8385
9408
  switch (llama_vocab_get_type(vocab)) {
8386
- case LLAMA_VOCAB_TYPE_SPM: {
8387
- auto buf = token_data.text.substr(3, 2);
8388
- return strtol(buf.c_str(), NULL, 16);
8389
- }
8390
- case LLAMA_VOCAB_TYPE_BPE: {
8391
- GGML_ASSERT(false);
8392
- return unicode_to_bytes_bpe(token_data.text);
8393
- }
8394
- case LLAMA_VOCAB_TYPE_WPM: {
8395
- GGML_ASSERT(false);
8396
- }
8397
- default:
8398
- GGML_ASSERT(false);
9409
+ case LLAMA_VOCAB_TYPE_SPM: {
9410
+ auto buf = token_data.text.substr(3, 2);
9411
+ return strtol(buf.c_str(), NULL, 16);
9412
+ }
9413
+ case LLAMA_VOCAB_TYPE_BPE: {
9414
+ GGML_ASSERT(false);
9415
+ return unicode_utf8_to_byte(token_data.text);
9416
+ }
9417
+ case LLAMA_VOCAB_TYPE_WPM: {
9418
+ GGML_ASSERT(false);
9419
+ }
9420
+ default:
9421
+ GGML_ASSERT(false);
8399
9422
  }
8400
9423
  }
8401
9424
 
8402
9425
  static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
9426
+ GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
8403
9427
  static const char * hex = "0123456789ABCDEF";
8404
9428
  switch (llama_vocab_get_type(vocab)) {
8405
9429
  case LLAMA_VOCAB_TYPE_SPM: {
@@ -8414,7 +9438,7 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
8414
9438
  }
8415
9439
  case LLAMA_VOCAB_TYPE_WPM:
8416
9440
  case LLAMA_VOCAB_TYPE_BPE: {
8417
- return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
9441
+ return vocab.token_to_id.at(unicode_byte_to_utf8(ch));
8418
9442
  }
8419
9443
  default:
8420
9444
  GGML_ASSERT(false);
@@ -8754,9 +9778,9 @@ private:
8754
9778
  bpe_words.reserve(text.size());
8755
9779
  bpe_encoded_words.reserve(text.size());
8756
9780
 
8757
- auto cps = codepoints_from_utf8(text);
8758
- for (size_t i = 0; i < cps.size(); ++i)
8759
- text_utf.emplace_back(codepoint_to_utf8(cps[i]));
9781
+ const auto cpts = unicode_cpts_from_utf8(text);
9782
+ for (size_t i = 0; i < cpts.size(); ++i)
9783
+ text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
8760
9784
 
8761
9785
  for (int i = 0; i < (int)text_utf.size(); i++) {
8762
9786
  const std::string & utf_char = text_utf[i];
@@ -8806,40 +9830,40 @@ private:
8806
9830
  }
8807
9831
 
8808
9832
  if (!split_condition && !collecting) {
8809
- if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
9833
+ if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
8810
9834
  collecting_letter = true;
8811
9835
  collecting = true;
8812
9836
  }
8813
- else if (codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
9837
+ else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
8814
9838
  collecting_numeric = true;
8815
9839
  collecting = true;
8816
9840
  }
8817
9841
  else if (
8818
- ((codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (codepoint_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
8819
- (!token.size() && utf_char == " " && codepoint_type(utf_char_next) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
9842
+ ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
9843
+ (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
8820
9844
  ) {
8821
9845
  collecting_special = true;
8822
9846
  collecting = true;
8823
9847
  }
8824
- else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
9848
+ else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
8825
9849
  collecting_whitespace_lookahead = true;
8826
9850
  collecting = true;
8827
9851
  }
8828
- else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
9852
+ else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
8829
9853
  split_condition = true;
8830
9854
  }
8831
9855
  }
8832
9856
  else if (!split_condition && collecting) {
8833
- if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) {
9857
+ if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
8834
9858
  split_condition = true;
8835
9859
  }
8836
- else if (collecting_numeric && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
9860
+ else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
8837
9861
  split_condition = true;
8838
9862
  }
8839
- else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
9863
+ else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
8840
9864
  split_condition = true;
8841
9865
  }
8842
- else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
9866
+ else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
8843
9867
  split_condition = true;
8844
9868
  }
8845
9869
  }
@@ -8868,7 +9892,7 @@ private:
8868
9892
  for (std::string & word : bpe_words) {
8869
9893
  std::string encoded_token = "";
8870
9894
  for (char & c : word) {
8871
- encoded_token += bytes_to_unicode_bpe(c);
9895
+ encoded_token += unicode_byte_to_utf8(c);
8872
9896
  }
8873
9897
  bpe_encoded_words.emplace_back(encoded_token);
8874
9898
  }
@@ -8942,25 +9966,13 @@ struct llm_tokenizer_wpm {
8942
9966
  }
8943
9967
 
8944
9968
  std::vector<std::string> preprocess(const std::string & text) {
8945
- // normalalization form D
8946
- std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
8947
- std::vector<uint32_t> nfd_codepoints;
8948
- for (uint32_t code : codepoints) {
8949
- auto it = nfd_map.equal_range(code);
8950
- if (it.first != it.second) {
8951
- for (auto jt = it.first; jt != it.second; jt++) {
8952
- nfd_codepoints.push_back(jt->second);
8953
- }
8954
- } else {
8955
- nfd_codepoints.push_back(code);
8956
- }
8957
- }
9969
+ std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
8958
9970
 
8959
9971
  // strip accents, strip control, uniformize whitespace,
8960
9972
  // to lowercase, pad chinese characters, pad punctuation
8961
9973
  std::string new_str = "";
8962
- for (uint32_t code : nfd_codepoints) {
8963
- int type = codepoint_type(code);
9974
+ for (uint32_t code : cpts_nfd) {
9975
+ int type = unicode_cpt_type(code);
8964
9976
  if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
8965
9977
  continue;
8966
9978
  }
@@ -8968,7 +9980,7 @@ struct llm_tokenizer_wpm {
8968
9980
  if (type == CODEPOINT_TYPE_WHITESPACE) {
8969
9981
  code = ' ';
8970
9982
  }
8971
- std::string s = codepoint_to_utf8(code);
9983
+ std::string s = unicode_cpt_to_utf8(code);
8972
9984
  if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
8973
9985
  new_str += " ";
8974
9986
  new_str += s;
@@ -8988,8 +10000,7 @@ struct llm_tokenizer_wpm {
8988
10000
  if (r > l) words.push_back(new_str.substr(l, (r - l)));
8989
10001
  l = r + 1;
8990
10002
  r = l;
8991
- }
8992
- else {
10003
+ } else {
8993
10004
  r += 1;
8994
10005
  }
8995
10006
  }
@@ -9013,17 +10024,17 @@ struct llm_tokenizer_wpm {
9013
10024
  return code < 256 && ispunct(code);
9014
10025
  }
9015
10026
 
9016
- bool is_chinese_char(uint32_t codepoint) {
9017
- if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
9018
- (codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
9019
- (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
9020
- (codepoint >= 0x2A700 && codepoint <= 0x2B73F) ||
9021
- (codepoint >= 0x2B740 && codepoint <= 0x2B81F) ||
9022
- (codepoint >= 0x2B920 && codepoint <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
9023
- (codepoint >= 0xF900 && codepoint <= 0xFAFF) ||
9024
- (codepoint >= 0x2F800 && codepoint <= 0x2FA1F) ||
9025
- (codepoint >= 0x3000 && codepoint <= 0x303F) ||
9026
- (codepoint >= 0xFF00 && codepoint <= 0xFFEF)) {
10027
+ bool is_chinese_char(uint32_t cpt) {
10028
+ if ((cpt >= 0x4E00 && cpt <= 0x9FFF) ||
10029
+ (cpt >= 0x3400 && cpt <= 0x4DBF) ||
10030
+ (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
10031
+ (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
10032
+ (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
10033
+ (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
10034
+ (cpt >= 0xF900 && cpt <= 0xFAFF) ||
10035
+ (cpt >= 0x2F800 && cpt <= 0x2FA1F) ||
10036
+ (cpt >= 0x3000 && cpt <= 0x303F) ||
10037
+ (cpt >= 0xFF00 && cpt <= 0xFFEF)) {
9027
10038
  return true; // NOLINT
9028
10039
  }
9029
10040
  return false;
@@ -9244,6 +10255,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
9244
10255
  }
9245
10256
  }
9246
10257
  } break;
10258
+ case LLAMA_VOCAB_TYPE_NONE:
10259
+ GGML_ASSERT(false);
9247
10260
  }
9248
10261
 
9249
10262
  return output;
@@ -9600,7 +10613,7 @@ struct llama_grammar * llama_grammar_init(
9600
10613
 
9601
10614
  // loop over alternates of start rule to build initial stacks
9602
10615
  std::vector<std::vector<const llama_grammar_element *>> stacks;
9603
- pos = rules[start_rule_index];
10616
+ pos = vec_rules[start_rule_index].data();
9604
10617
  do {
9605
10618
  std::vector<const llama_grammar_element *> stack;
9606
10619
  if (!llama_grammar_is_end_of_sequence(pos)) {
@@ -10615,13 +11628,16 @@ struct quantize_state_internal {
10615
11628
 
10616
11629
  bool has_imatrix = false;
10617
11630
 
11631
+ // used to figure out if a model shares tok_embd with the output weight
11632
+ bool has_output = false;
11633
+
10618
11634
  quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
10619
11635
  : model(model)
10620
11636
  , params(params)
10621
11637
  {}
10622
11638
  };
10623
11639
 
10624
- static void llama_convert_tensor_internal(
11640
+ static void llama_tensor_dequantize_internal(
10625
11641
  struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
10626
11642
  const size_t nelements, const int nthread
10627
11643
  ) {
@@ -10682,7 +11698,7 @@ static void llama_convert_tensor_internal(
10682
11698
  workers.clear();
10683
11699
  }
10684
11700
 
10685
- static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
11701
+ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
10686
11702
  const std::string name = ggml_get_name(tensor);
10687
11703
 
10688
11704
  // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -10712,8 +11728,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10712
11728
 
10713
11729
  // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
10714
11730
  // with the quantization of the output tensor
10715
- if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
10716
- (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
11731
+ if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
10717
11732
  int nx = tensor->ne[0];
10718
11733
  if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
10719
11734
  new_type = GGML_TYPE_Q8_0;
@@ -10962,41 +11977,76 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10962
11977
  return new_type;
10963
11978
  }
10964
11979
 
11980
+ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
11981
+ std::mutex mutex;
11982
+ int counter = 0;
11983
+ size_t new_size = 0;
11984
+ if (nthread < 2) {
11985
+ // single-thread
11986
+ return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
11987
+ }
11988
+ auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
11989
+ nrows, n_per_row, imatrix]() {
11990
+ const int nrows_per_chunk = chunk_size / n_per_row;
11991
+ size_t local_size = 0;
11992
+ while (true) {
11993
+ std::unique_lock<std::mutex> lock(mutex);
11994
+ int first_row = counter; counter += nrows_per_chunk;
11995
+ if (first_row >= nrows) {
11996
+ if (local_size > 0) {
11997
+ new_size += local_size;
11998
+ }
11999
+ break;
12000
+ }
12001
+ lock.unlock();
12002
+ const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
12003
+ local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
12004
+ }
12005
+ };
12006
+ for (int it = 0; it < nthread - 1; ++it) {
12007
+ workers.emplace_back(compute);
12008
+ }
12009
+ compute();
12010
+ for (auto & w : workers) { w.join(); }
12011
+ workers.clear();
12012
+ return new_size;
12013
+ }
12014
+
10965
12015
  static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
10966
- ggml_type quantized_type;
12016
+ ggml_type default_type;
10967
12017
  llama_ftype ftype = params->ftype;
10968
12018
 
10969
12019
  switch (params->ftype) {
10970
- case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
10971
- case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
10972
- case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
10973
- case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
10974
- case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
10975
- case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
10976
- case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
12020
+ case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
12021
+ case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
12022
+ case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
12023
+ case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
12024
+ case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
12025
+ case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
12026
+ case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
10977
12027
 
10978
12028
  // K-quants
10979
12029
  case LLAMA_FTYPE_MOSTLY_Q2_K_S:
10980
- case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
10981
- case LLAMA_FTYPE_MOSTLY_IQ3_XS: quantized_type = GGML_TYPE_IQ3_S; break;
12030
+ case LLAMA_FTYPE_MOSTLY_Q2_K: default_type = GGML_TYPE_Q2_K; break;
12031
+ case LLAMA_FTYPE_MOSTLY_IQ3_XS: default_type = GGML_TYPE_IQ3_S; break;
10982
12032
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
10983
12033
  case LLAMA_FTYPE_MOSTLY_Q3_K_M:
10984
- case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
12034
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: default_type = GGML_TYPE_Q3_K; break;
10985
12035
  case LLAMA_FTYPE_MOSTLY_Q4_K_S:
10986
- case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
12036
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: default_type = GGML_TYPE_Q4_K; break;
10987
12037
  case LLAMA_FTYPE_MOSTLY_Q5_K_S:
10988
- case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
10989
- case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
10990
- case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
10991
- case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
10992
- case LLAMA_FTYPE_MOSTLY_IQ2_S: quantized_type = GGML_TYPE_IQ2_XS; break;
10993
- case LLAMA_FTYPE_MOSTLY_IQ2_M: quantized_type = GGML_TYPE_IQ2_S; break;
10994
- case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
10995
- case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
10996
- case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;
10997
- case LLAMA_FTYPE_MOSTLY_IQ4_XS: quantized_type = GGML_TYPE_IQ4_XS; break;
10998
- case LLAMA_FTYPE_MOSTLY_IQ3_S: quantized_type = GGML_TYPE_IQ3_S; break;
10999
- case LLAMA_FTYPE_MOSTLY_IQ3_M: quantized_type = GGML_TYPE_IQ3_S; break;
12038
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = GGML_TYPE_Q5_K; break;
12039
+ case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = GGML_TYPE_Q6_K; break;
12040
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
12041
+ case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break;
12042
+ case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break;
12043
+ case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
12044
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
12045
+ case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
12046
+ case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
12047
+ case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
12048
+ case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
12049
+ case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
11000
12050
 
11001
12051
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
11002
12052
  }
@@ -11062,6 +12112,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
11062
12112
  else if (name.find("ffn_up") != std::string::npos) {
11063
12113
  ++qs.n_ffn_up;
11064
12114
  }
12115
+ else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
12116
+ qs.has_output = true;
12117
+ }
11065
12118
  }
11066
12119
  if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
11067
12120
  LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
@@ -11070,11 +12123,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
11070
12123
 
11071
12124
  size_t total_size_org = 0;
11072
12125
  size_t total_size_new = 0;
11073
- std::vector<int64_t> hist_all(1 << 4, 0);
11074
12126
 
11075
12127
  std::vector<std::thread> workers;
11076
12128
  workers.reserve(nthread);
11077
- std::mutex mutex;
11078
12129
 
11079
12130
  int idx = 0;
11080
12131
 
@@ -11133,20 +12184,29 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
11133
12184
  quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
11134
12185
  quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
11135
12186
 
12187
+ // do not quantize Mamba's small yet 2D weights
12188
+ // NOTE: can't use LLM_TN here because the layer number is not known
12189
+ quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
12190
+ quantize &= name.find("ssm_x.weight") == std::string::npos;
12191
+ quantize &= name.find("ssm_dt.weight") == std::string::npos;
12192
+
11136
12193
  enum ggml_type new_type;
11137
12194
  void * new_data;
11138
12195
  size_t new_size;
11139
12196
 
11140
12197
  if (quantize) {
11141
- new_type = quantized_type;
11142
- if (!params->pure) {
11143
- new_type = get_k_quant_type(qs, new_type, tensor, ftype);
12198
+ new_type = default_type;
12199
+
12200
+ // get more optimal quantization type based on the tensor shape, layer, etc.
12201
+ if (!params->pure && ggml_is_quantized(default_type)) {
12202
+ new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
11144
12203
  }
11145
12204
 
11146
12205
  // If we've decided to quantize to the same type the tensor is already
11147
12206
  // in then there's nothing to do.
11148
12207
  quantize = tensor->type != new_type;
11149
12208
  }
12209
+
11150
12210
  if (!quantize) {
11151
12211
  new_type = tensor->type;
11152
12212
  new_data = tensor->data;
@@ -11188,18 +12248,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
11188
12248
  } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
11189
12249
  throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
11190
12250
  } else {
11191
- llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
12251
+ llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
11192
12252
  f32_data = (float *) f32_conv_buf.data();
11193
12253
  }
11194
12254
 
11195
- LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
12255
+ LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
11196
12256
  fflush(stdout);
11197
12257
 
11198
12258
  if (work.size() < nelements * 4) {
11199
12259
  work.resize(nelements * 4); // upper bound on size
11200
12260
  }
11201
12261
  new_data = work.data();
11202
- std::array<int64_t, 1 << 4> hist_cur = {};
11203
12262
 
11204
12263
  const int n_per_row = tensor->ne[0];
11205
12264
  const int nrows = nelements / n_per_row;
@@ -11209,56 +12268,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
11209
12268
 
11210
12269
  const int nchunk = (nelements + chunk_size - 1)/chunk_size;
11211
12270
  const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
11212
- if (nthread_use < 2) {
11213
- new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
11214
- } else {
11215
- int counter = 0;
11216
- new_size = 0;
11217
- auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
11218
- nrows, n_per_row, imatrix]() {
11219
- std::array<int64_t, 1 << 4> local_hist = {};
11220
- const int nrows_per_chunk = chunk_size / n_per_row;
11221
- size_t local_size = 0;
11222
- while (true) {
11223
- std::unique_lock<std::mutex> lock(mutex);
11224
- int first_row = counter; counter += nrows_per_chunk;
11225
- if (first_row >= nrows) {
11226
- if (local_size > 0) {
11227
- for (int j=0; j<int(local_hist.size()); ++j) {
11228
- hist_cur[j] += local_hist[j];
11229
- }
11230
- new_size += local_size;
11231
- }
11232
- break;
11233
- }
11234
- lock.unlock();
11235
- const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
11236
- local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
11237
- first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
11238
- }
11239
- };
11240
- for (int it = 0; it < nthread_use - 1; ++it) {
11241
- workers.emplace_back(compute);
11242
- }
11243
- compute();
11244
- for (auto & w : workers) { w.join(); }
11245
- workers.clear();
11246
- }
11247
-
11248
- LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
11249
- int64_t tot_count = 0;
11250
- for (size_t i = 0; i < hist_cur.size(); i++) {
11251
- hist_all[i] += hist_cur[i];
11252
- tot_count += hist_cur[i];
11253
- }
12271
+ new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix, workers, nthread_use);
11254
12272
 
11255
- if (tot_count > 0) {
11256
- LLAMA_LOG_INFO(" | hist: ");
11257
- for (size_t i = 0; i < hist_cur.size(); i++) {
11258
- LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
11259
- }
11260
- }
11261
- LLAMA_LOG_INFO("\n");
12273
+ LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
11262
12274
  }
11263
12275
  total_size_org += ggml_nbytes(tensor);
11264
12276
  total_size_new += new_size;
@@ -11287,24 +12299,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
11287
12299
  LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
11288
12300
  LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
11289
12301
 
11290
- // print histogram for all tensors
11291
- {
11292
- int64_t sum_all = 0;
11293
- for (size_t i = 0; i < hist_all.size(); i++) {
11294
- sum_all += hist_all[i];
11295
- }
11296
-
11297
- if (sum_all > 0) {
11298
- LLAMA_LOG_INFO("%s: hist: ", __func__);
11299
- for (size_t i = 0; i < hist_all.size(); i++) {
11300
- LLAMA_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all));
11301
- }
11302
- LLAMA_LOG_INFO("\n");
11303
- }
11304
- }
11305
-
11306
12302
  if (qs.n_fallback > 0) {
11307
- LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n",
12303
+ LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
11308
12304
  __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
11309
12305
  }
11310
12306
  }
@@ -11616,10 +12612,13 @@ struct llama_context_params llama_context_default_params() {
11616
12612
  struct llama_context_params result = {
11617
12613
  /*.seed =*/ LLAMA_DEFAULT_SEED,
11618
12614
  /*.n_ctx =*/ 512,
11619
- /*.n_batch =*/ 512,
12615
+ /*.n_batch =*/ 2048,
12616
+ /*.n_ubatch =*/ 512,
12617
+ /*.n_seq_max =*/ 1,
11620
12618
  /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
11621
12619
  /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
11622
12620
  /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
12621
+ /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
11623
12622
  /*.rope_freq_base =*/ 0.0f,
11624
12623
  /*.rope_freq_scale =*/ 0.0f,
11625
12624
  /*.yarn_ext_factor =*/ -1.0f,
@@ -11633,9 +12632,10 @@ struct llama_context_params llama_context_default_params() {
11633
12632
  /*.type_k =*/ GGML_TYPE_F16,
11634
12633
  /*.type_v =*/ GGML_TYPE_F16,
11635
12634
  /*.logits_all =*/ false,
11636
- /*.embedding =*/ false,
12635
+ /*.embeddings =*/ false,
11637
12636
  /*.offload_kqv =*/ true,
11638
- /*.do_pooling =*/ true,
12637
+ /*.abort_callback =*/ nullptr,
12638
+ /*.abort_callback_data =*/ nullptr,
11639
12639
  };
11640
12640
 
11641
12641
  return result;
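A hypothetical usage sketch of the new context parameters; the concrete values and the commented-out model loading are assumptions for illustration, not part of this diff:

#include "llama.h"

int main() {
    llama_context_params cparams = llama_context_default_params();
    cparams.n_batch      = 2048;                    // logical batch size accepted by llama_decode
    cparams.n_ubatch     = 512;                     // physical micro-batch actually run through the graph
    cparams.n_seq_max    = 4;                       // upper bound on distinct sequence ids
    cparams.embeddings   = true;                    // renamed from the old `embedding` flag
    cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN; // overrides the model default when not UNSPECIFIED
    // llama_model   * model = llama_load_model_from_file("model.gguf", llama_model_default_params());
    // llama_context * ctx   = llama_new_context_with_model(model, cparams);
    return 0;
}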
@@ -11767,6 +12767,17 @@ struct llama_context * llama_new_context_with_model(
11767
12767
  struct llama_context_params params) {
11768
12768
 
11769
12769
  if (!model) {
12770
+ LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
12771
+ return nullptr;
12772
+ }
12773
+
12774
+ if (params.n_batch == 0 && params.n_ubatch == 0) {
12775
+ LLAMA_LOG_ERROR("%s: n_batch and n_ubatch cannot both be zero\n", __func__);
12776
+ return nullptr;
12777
+ }
12778
+
12779
+ if (params.n_ctx == 0 && model->hparams.n_ctx_train == 0) {
12780
+ LLAMA_LOG_ERROR("%s: n_ctx and model->hparams.n_ctx_train cannot both be zero\n", __func__);
11770
12781
  return nullptr;
11771
12782
  }
11772
12783
 
@@ -11775,7 +12786,7 @@ struct llama_context * llama_new_context_with_model(
11775
12786
  const auto & hparams = model->hparams;
11776
12787
  auto & cparams = ctx->cparams;
11777
12788
 
11778
- cparams.n_batch = params.n_batch;
12789
+ // TODO: maybe add n_seq_max here too
11779
12790
  cparams.n_threads = params.n_threads;
11780
12791
  cparams.n_threads_batch = params.n_threads_batch;
11781
12792
  cparams.yarn_ext_factor = params.yarn_ext_factor;
@@ -11783,13 +12794,19 @@ struct llama_context * llama_new_context_with_model(
11783
12794
  cparams.yarn_beta_fast = params.yarn_beta_fast;
11784
12795
  cparams.yarn_beta_slow = params.yarn_beta_slow;
11785
12796
  cparams.defrag_thold = params.defrag_thold;
12797
+ cparams.embeddings = params.embeddings;
11786
12798
  cparams.offload_kqv = params.offload_kqv;
11787
- cparams.do_pooling = params.do_pooling;
12799
+ cparams.pooling_type = params.pooling_type;
11788
12800
 
11789
12801
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
11790
12802
  cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
11791
12803
  cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
11792
12804
 
12805
+ // with causal attention, the batch size is limited by the context size
12806
+ cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
12807
+ cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
12808
+
12809
+
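Worked example (parameter values assumed): for a causal model with n_ctx = 4096, params.n_batch = 2048 and params.n_ubatch = 512, this yields cparams.n_batch = min(4096, 2048) = 2048 and cparams.n_ubatch = min(2048, 512) = 512; if n_ctx were only 512, n_batch would be clamped to 512 and n_ubatch to min(512, 512) = 512.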
11793
12810
  cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
11794
12811
  hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
11795
12812
  hparams.n_ctx_train;
@@ -11810,19 +12827,44 @@ struct llama_context * llama_new_context_with_model(
11810
12827
  cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
11811
12828
  }
11812
12829
 
12830
+ cparams.causal_attn = hparams.causal_attn;
12831
+
12832
+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
12833
+ if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
12834
+ cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
12835
+ } else {
12836
+ cparams.pooling_type = hparams.pooling_type;
12837
+ }
12838
+ }
12839
+
11813
12840
  if (params.seed == LLAMA_DEFAULT_SEED) {
11814
12841
  params.seed = time(NULL);
11815
12842
  }
11816
12843
 
11817
12844
  LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
12845
+ LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
12846
+ LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
11818
12847
  LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
11819
12848
  LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
11820
12849
 
11821
- ctx->rng = std::mt19937(params.seed);
11822
- ctx->logits_all = params.logits_all;
12850
+ ctx->abort_callback = params.abort_callback;
12851
+ ctx->abort_callback_data = params.abort_callback_data;
12852
+
12853
+ ctx->rng = std::mt19937(params.seed);
12854
+ ctx->logits_all = params.logits_all;
11823
12855
 
11824
- const ggml_type type_k = params.type_k;
11825
- const ggml_type type_v = params.type_v;
12856
+ uint32_t kv_size = cparams.n_ctx;
12857
+ ggml_type type_k = params.type_k;
12858
+ ggml_type type_v = params.type_v;
12859
+
12860
+ // Mamba only needs a constant number of KV cache cells per sequence
12861
+ if (model->arch == LLM_ARCH_MAMBA) {
12862
+ // Mamba needs at least as many KV cells as there are sequences kept at any time
12863
+ kv_size = std::max((uint32_t) 1, params.n_seq_max);
12864
+ // it's probably best to keep as much precision as possible for the states
12865
+ type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states
12866
+ type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states
12867
+ }
11826
12868
 
11827
12869
  GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
11828
12870
  GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
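Because Mamba stores a fixed-size recurrent state per sequence, the cache above is sized from n_seq_max rather than n_ctx, and the cache types are forced to F32 regardless of type_k/type_v. A hedged sketch of creating such a context; the GGUF path and the value 4 are placeholders:

// Sketch under the assumption that `path` points to a Mamba GGUF model.
#include "llama.h"

static llama_context * make_mamba_context(const char * path) {
    llama_model * model = llama_load_model_from_file(path, llama_model_default_params());
    if (model == nullptr) {
        return nullptr;
    }
    llama_context_params cparams = llama_context_default_params();
    cparams.n_seq_max = 4; // number of recurrent states (KV cells) kept for Mamba
    // cparams.type_k / cparams.type_v are ignored here: the code above forces F32 states
    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == nullptr) {
        llama_free_model(model);
    }
    return ctx;
}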
@@ -11877,13 +12919,31 @@ struct llama_context * llama_new_context_with_model(
11877
12919
  }
11878
12920
  #elif defined(GGML_USE_SYCL)
11879
12921
  if (model->n_gpu_layers > 0) {
11880
- ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
11881
- if (backend == nullptr) {
11882
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
11883
- llama_free(ctx);
11884
- return nullptr;
12922
+ // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
12923
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
12924
+ int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
12925
+ ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
12926
+ if (backend == nullptr) {
12927
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
12928
+ llama_free(ctx);
12929
+ return nullptr;
12930
+ }
12931
+ ctx->backends.push_back(backend);
12932
+ } else {
12933
+ // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
12934
+ int id_list[GGML_SYCL_MAX_DEVICES];
12935
+ ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
12936
+ for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
12937
+ int device_id = id_list[i];
12938
+ ggml_backend_t backend = ggml_backend_sycl_init(i);
12939
+ if (backend == nullptr) {
12940
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
12941
+ llama_free(ctx);
12942
+ return nullptr;
12943
+ }
12944
+ ctx->backends.push_back(backend);
12945
+ }
11885
12946
  }
11886
- ctx->backends.push_back(backend);
11887
12947
  }
11888
12948
  #elif defined(GGML_USE_KOMPUTE)
11889
12949
  if (model->n_gpu_layers > 0) {
@@ -11904,7 +12964,7 @@ struct llama_context * llama_new_context_with_model(
11904
12964
  }
11905
12965
  ctx->backends.push_back(ctx->backend_cpu);
11906
12966
 
11907
- if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) {
12967
+ if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) {
11908
12968
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
11909
12969
  llama_free(ctx);
11910
12970
  return nullptr;
@@ -11928,45 +12988,31 @@ struct llama_context * llama_new_context_with_model(
11928
12988
  ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
11929
12989
  }
11930
12990
 
11931
- // resized during inference, reserve maximum
11932
- ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
11933
-
11934
- if (params.embedding) {
11935
- ctx->embedding.resize(hparams.n_embd);
11936
- }
11937
-
11938
- // graph inputs
12991
+ // graph outputs buffer
11939
12992
  {
11940
- ggml_init_params init_params = {
11941
- /* .mem_size */ ggml_tensor_overhead()*8,
11942
- /* .mem_buffer */ nullptr,
11943
- /* .no_alloc */ true,
11944
- };
11945
- ctx->ctx_input = ggml_init(init_params);
12993
+ // resized during inference, reserve maximum
12994
+ ctx->logits_size = hparams.n_vocab*cparams.n_batch;
12995
+ ctx->embd_size = params.embeddings ? hparams.n_embd*cparams.n_batch : 0;
11946
12996
 
11947
- ctx->inp_tokens = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
11948
- ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
11949
- ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
11950
- ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
11951
- ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
11952
- ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
11953
- ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
11954
- ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
12997
+ const size_t buf_output_size = (ctx->logits_size + ctx->embd_size)*sizeof(float);
12998
+
12999
+ ctx->buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_output_size);
13000
+ if (ctx->buf_output == nullptr) {
13001
+ LLAMA_LOG_ERROR("%s: failed to allocate logits buffer\n", __func__);
13002
+ llama_free(ctx);
13003
+ return nullptr;
13004
+ }
13005
+ ggml_backend_buffer_clear(ctx->buf_output, 0);
11955
13006
 
11956
- ggml_set_name(ctx->inp_tokens, "inp_tokens");
11957
- ggml_set_name(ctx->inp_embd, "inp_embd");
11958
- ggml_set_name(ctx->inp_pos, "inp_pos");
11959
- ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
11960
- ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos");
11961
- ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
11962
- ggml_set_name(ctx->inp_mean, "inp_mean");
11963
- ggml_set_name(ctx->inp_cls, "inp_cls");
11964
13007
 
11965
- ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
13008
+ ctx->logits = (float *) ggml_backend_buffer_get_base(ctx->buf_output);
13009
+ if (params.embeddings) {
13010
+ ctx->embd = ctx->logits + ctx->logits_size;
13011
+ }
11966
13012
 
11967
- LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
11968
- ggml_backend_buffer_name(ctx->buf_input),
11969
- ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
13013
+ LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
13014
+ ggml_backend_buffer_name(ctx->buf_output),
13015
+ ggml_backend_buffer_get_size(ctx->buf_output) / 1024.0 / 1024.0);
11970
13016
  }
11971
13017
 
11972
13018
  // scheduler and compute buffers
@@ -11985,10 +13031,21 @@ struct llama_context * llama_new_context_with_model(
11985
13031
  // buffer used to store the computation graph and the tensor meta data
11986
13032
  ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
11987
13033
 
11988
- ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
13034
+ // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
13035
+ bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
13036
+ #ifndef GGML_USE_CUBLAS
13037
+ // pipeline parallelism requires support for async compute and events
13038
+ // currently this is only implemented in the CUDA backend
13039
+ pipeline_parallel = false;
13040
+ #endif
13041
+ ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel);
13042
+
13043
+ if (pipeline_parallel) {
13044
+ LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
13045
+ }
11989
13046
 
11990
13047
  // build worst-case graph
11991
- int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
13048
+ int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_ubatch);
11992
13049
  int n_past = cparams.n_ctx - n_tokens;
11993
13050
  llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
11994
13051
  ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
@@ -12011,7 +13068,7 @@ struct llama_context * llama_new_context_with_model(
12011
13068
 
12012
13069
  // note: the number of splits during measure is higher than during inference due to the kv shift
12013
13070
  int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
12014
- LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
13071
+ LLAMA_LOG_INFO("%s: graph splits: %d\n", __func__, n_splits);
12015
13072
  }
12016
13073
  }
12017
13074
 
@@ -12048,6 +13105,14 @@ uint32_t llama_n_batch(const struct llama_context * ctx) {
12048
13105
  return ctx->cparams.n_batch;
12049
13106
  }
12050
13107
 
13108
+ uint32_t llama_n_ubatch(const struct llama_context * ctx) {
13109
+ return ctx->cparams.n_ubatch;
13110
+ }
13111
+
13112
+ uint32_t llama_n_seq_max(const struct llama_context * ctx) {
13113
+ return ctx->kv_self.size;
13114
+ }
13115
+
12051
13116
  enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
12052
13117
  return model->vocab.type;
12053
13118
  }
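The two getters added above expose the effective micro-batch size and the KV cell count chosen during context creation. A small sketch that logs them next to the existing n_ctx/n_batch accessors (the helper name is illustrative):

// Sketch only; print_context_sizes is not part of the library.
#include <cstdio>
#include "llama.h"

static void print_context_sizes(const llama_context * ctx) {
    std::printf("n_ctx     = %u\n", llama_n_ctx(ctx));
    std::printf("n_batch   = %u\n", llama_n_batch(ctx));
    std::printf("n_ubatch  = %u\n", llama_n_ubatch(ctx));   // new in this release
    std::printf("n_seq_max = %u\n", llama_n_seq_max(ctx));  // new: size of the KV cache
}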
@@ -12061,6 +13126,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
12061
13126
  case LLM_ARCH_MPT:
12062
13127
  case LLM_ARCH_REFACT:
12063
13128
  case LLM_ARCH_BLOOM:
13129
+ case LLM_ARCH_MAMBA:
12064
13130
  return LLAMA_ROPE_TYPE_NONE;
12065
13131
 
12066
13132
  // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -12084,6 +13150,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
12084
13150
  case LLM_ARCH_QWEN2:
12085
13151
  case LLM_ARCH_PHI2:
12086
13152
  case LLM_ARCH_GEMMA:
13153
+ case LLM_ARCH_STARCODER2:
12087
13154
  return LLAMA_ROPE_TYPE_NEOX;
12088
13155
 
12089
13156
  // all model arches should be listed explicitly here
@@ -12096,7 +13163,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
12096
13163
  }
12097
13164
 
12098
13165
  int32_t llama_n_vocab(const struct llama_model * model) {
12099
- return model->vocab.id_to_token.size();
13166
+ return model->hparams.n_vocab;
12100
13167
  }
12101
13168
 
12102
13169
  int32_t llama_n_ctx_train(const struct llama_model * model) {
@@ -12206,10 +13273,10 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
12206
13273
  }
12207
13274
  }
12208
13275
 
12209
- struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
13276
+ struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
12210
13277
  struct llama_kv_cache_view result = {
12211
13278
  /*.n_cells = */ 0,
12212
- /*.n_max_seq = */ n_max_seq,
13279
+ /*.n_seq_max = */ n_seq_max,
12213
13280
  /*.token_count = */ 0,
12214
13281
  /*.used_cells = */ llama_get_kv_cache_used_cells(ctx),
12215
13282
  /*.max_contiguous = */ 0,
@@ -12237,7 +13304,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
12237
13304
  void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
12238
13305
  GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
12239
13306
  view->cells = (struct llama_kv_cache_view_cell *)p;
12240
- p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
13307
+ p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
12241
13308
  GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
12242
13309
  view->cells_sequences = (llama_seq_id *)p;
12243
13310
  }
@@ -12251,7 +13318,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
12251
13318
  uint32_t max_contig = 0;
12252
13319
  int32_t max_contig_idx = -1;
12253
13320
 
12254
- for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
13321
+ for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_seq_max) {
12255
13322
  const size_t curr_size = kv_cells[i].seq_id.size();
12256
13323
  token_count += curr_size;
12257
13324
  c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
@@ -12268,7 +13335,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
12268
13335
 
12269
13336
  int seq_idx = 0;
12270
13337
  for (const llama_seq_id it : kv_cells[i].seq_id) {
12271
- if (seq_idx >= view->n_max_seq) {
13338
+ if (seq_idx >= view->n_seq_max) {
12272
13339
  break;
12273
13340
  }
12274
13341
  cs_curr[seq_idx] = it;
@@ -12277,7 +13344,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
12277
13344
  if (seq_idx != 0) {
12278
13345
  used_cells++;
12279
13346
  }
12280
- for (; seq_idx < view->n_max_seq; seq_idx++) {
13347
+ for (; seq_idx < view->n_seq_max; seq_idx++) {
12281
13348
  cs_curr[seq_idx] = -1;
12282
13349
  }
12283
13350
  }
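With the field renamed from n_max_seq to n_seq_max, inspecting cache occupancy through the view API looks roughly like the sketch below; it assumes the llama_kv_cache_view_free helper declared alongside these functions, and the helper name is illustrative:

// Sketch: dump_kv_usage is illustrative, not part of the library.
#include <cstdio>
#include "llama.h"

static void dump_kv_usage(const llama_context * ctx) {
    llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_seq_max=*/4);
    llama_kv_cache_view_update(ctx, &view);
    std::printf("tokens=%d used_cells=%d max_contiguous=%d\n",
                view.token_count, view.used_cells, view.max_contiguous);
    llama_kv_cache_view_free(&view);
}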
@@ -12313,8 +13380,8 @@ void llama_kv_cache_clear(struct llama_context * ctx) {
12313
13380
  llama_kv_cache_clear(ctx->kv_self);
12314
13381
  }
12315
13382
 
12316
- void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
12317
- llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
13383
+ bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
13384
+ return llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
12318
13385
  }
12319
13386
 
12320
13387
  void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
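llama_kv_cache_seq_rm now reports whether the removal was actually performed; recurrent caches, for example, cannot drop an arbitrary middle range. A hedged usage sketch follows; the fallback policy is the caller's choice, not something the library prescribes:

// Sketch: drop everything from position keep_up_to onwards for one sequence.
#include "llama.h"

static void drop_tail(llama_context * ctx, llama_seq_id seq, llama_pos keep_up_to) {
    // p1 < 0 means "to the end of the sequence"
    if (!llama_kv_cache_seq_rm(ctx, seq, keep_up_to, -1)) {
        // partial removal not supported for this cache layout (e.g. recurrent states);
        // fall back to clearing the whole sequence
        llama_kv_cache_seq_rm(ctx, seq, -1, -1);
    }
}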
@@ -12365,12 +13432,17 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
12365
13432
  const size_t s_rng = LLAMA_MAX_RNG_STATE;
12366
13433
  const size_t s_logits_size = sizeof(size_t);
12367
13434
  // assume worst case for logits although only currently set ones are serialized
12368
- const size_t s_logits = ctx->logits.capacity() * sizeof(float);
13435
+ const size_t s_logits = ctx->logits_size * sizeof(float);
12369
13436
  const size_t s_embedding_size = sizeof(size_t);
12370
- const size_t s_embedding = ctx->embedding.size() * sizeof(float);
12371
- const size_t s_kv_size = sizeof(size_t);
12372
- const size_t s_kv_ntok = sizeof(int);
13437
+ const size_t s_embedding = ctx->embd_size * sizeof(float);
13438
+ const size_t s_kv_buf_size = sizeof(size_t);
13439
+ const size_t s_kv_head = sizeof(uint32_t);
13440
+ const size_t s_kv_size = sizeof(uint32_t);
13441
+ const size_t s_kv_used = sizeof(uint32_t);
12373
13442
  const size_t s_kv = ctx->kv_self.total_size();
13443
+ // TODO: assume the max is more than 1 seq_id per KV cell
13444
+ const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
13445
+ const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
12374
13446
 
12375
13447
  const size_t s_total = (
12376
13448
  + s_rng_size
@@ -12379,9 +13451,12 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
12379
13451
  + s_logits
12380
13452
  + s_embedding_size
12381
13453
  + s_embedding
13454
+ + s_kv_buf_size
13455
+ + s_kv_head
12382
13456
  + s_kv_size
12383
- + s_kv_ntok
13457
+ + s_kv_used
12384
13458
  + s_kv
13459
+ + s_kv_cells
12385
13460
  );
12386
13461
 
12387
13462
  return s_total;
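s_total is an upper bound, so the usual pattern is to allocate that much, serialize, and then trust the size returned by llama_copy_state_data. A short sketch of a save/restore round trip; the helper names are illustrative:

// Sketch only; save_state/restore_state are not library functions.
#include <cstdint>
#include <vector>
#include "llama.h"

static std::vector<uint8_t> save_state(llama_context * ctx) {
    std::vector<uint8_t> buf(llama_get_state_size(ctx)); // worst-case size (s_total above)
    const size_t written = llama_copy_state_data(ctx, buf.data());
    buf.resize(written); // the serialized state is usually smaller
    return buf;
}

static void restore_state(llama_context * ctx, const std::vector<uint8_t> & buf) {
    llama_set_state_data(ctx, buf.data());
}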
@@ -12457,23 +13532,23 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12457
13532
 
12458
13533
  // copy logits
12459
13534
  {
12460
- const size_t logits_size = ctx->logits.size();
13535
+ const size_t logits_size = ctx->logits_size;
12461
13536
 
12462
13537
  data_ctx->write(&logits_size, sizeof(logits_size));
12463
13538
 
12464
13539
  if (logits_size) {
12465
- data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
13540
+ data_ctx->write(ctx->logits, logits_size * sizeof(float));
12466
13541
  }
12467
13542
  }
12468
13543
 
12469
13544
  // copy embeddings
12470
13545
  {
12471
- const size_t embedding_size = ctx->embedding.size();
13546
+ const size_t embeddings_size = ctx->embd_size;
12472
13547
 
12473
- data_ctx->write(&embedding_size, sizeof(embedding_size));
13548
+ data_ctx->write(&embeddings_size, sizeof(embeddings_size));
12474
13549
 
12475
- if (embedding_size) {
12476
- data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
13550
+ if (embeddings_size) {
13551
+ data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
12477
13552
  }
12478
13553
  }
12479
13554
 
@@ -12481,15 +13556,13 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12481
13556
  {
12482
13557
  const auto & kv_self = ctx->kv_self;
12483
13558
  const auto & hparams = ctx->model.hparams;
12484
- const auto & cparams = ctx->cparams;
12485
13559
 
12486
13560
  const uint32_t n_layer = hparams.n_layer;
12487
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
12488
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
12489
- const uint32_t n_ctx = cparams.n_ctx;
13561
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
13562
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
12490
13563
 
12491
13564
  const size_t kv_buf_size = kv_self.total_size();
12492
- const uint32_t kv_head = kv_self.head;
13565
+ const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
12493
13566
  const uint32_t kv_size = kv_self.size;
12494
13567
  const uint32_t kv_used = kv_self.used;
12495
13568
 
@@ -12507,9 +13580,20 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12507
13580
  ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
12508
13581
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
12509
13582
 
13583
+ if (kv_self.recurrent) {
13584
+ // v is contiguous for recurrent models
13585
+ // TODO: use other tensors for state models than k and v
13586
+ const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
13587
+
13588
+ tmp_buf.resize(v_size);
13589
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), 0, tmp_buf.size());
13590
+ data_ctx->write(tmp_buf.data(), tmp_buf.size());
13591
+ continue;
13592
+ }
13593
+
12510
13594
  // v is not contiguous, copy row by row
12511
13595
  const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12512
- const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
13596
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
12513
13597
 
12514
13598
  tmp_buf.resize(v_row_size);
12515
13599
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
@@ -12519,7 +13603,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12519
13603
  }
12520
13604
  }
12521
13605
 
12522
- for (uint32_t i = 0; i < kv_size; ++i) {
13606
+ for (uint32_t i = 0; i < kv_head; ++i) {
12523
13607
  const auto & cell = kv_self.cells[i];
12524
13608
 
12525
13609
  const llama_pos pos = cell.pos;
@@ -12567,27 +13651,25 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
12567
13651
 
12568
13652
  memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
12569
13653
 
12570
- GGML_ASSERT(ctx->logits.capacity() >= logits_size);
13654
+ GGML_ASSERT(ctx->logits_size >= logits_size);
12571
13655
 
12572
13656
  if (logits_size) {
12573
- ctx->logits.resize(logits_size);
12574
-
12575
- memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
13657
+ memcpy(ctx->logits, inp, logits_size * sizeof(float));
12576
13658
  inp += logits_size * sizeof(float);
12577
13659
  }
12578
13660
  }
12579
13661
 
12580
13662
  // set embeddings
12581
13663
  {
12582
- size_t embedding_size;
13664
+ size_t embeddings_size;
12583
13665
 
12584
- memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
13666
+ memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
12585
13667
 
12586
- GGML_ASSERT(ctx->embedding.capacity() == embedding_size);
13668
+ GGML_ASSERT(ctx->embd_size == embeddings_size);
12587
13669
 
12588
- if (embedding_size) {
12589
- memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
12590
- inp += embedding_size * sizeof(float);
13670
+ if (embeddings_size) {
13671
+ memcpy(ctx->embd, inp, embeddings_size * sizeof(float));
13672
+ inp += embeddings_size * sizeof(float);
12591
13673
  }
12592
13674
  }
12593
13675
 
@@ -12595,12 +13677,10 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
12595
13677
  {
12596
13678
  const auto & kv_self = ctx->kv_self;
12597
13679
  const auto & hparams = ctx->model.hparams;
12598
- const auto & cparams = ctx->cparams;
12599
13680
 
12600
13681
  const uint32_t n_layer = hparams.n_layer;
12601
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
12602
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
12603
- const uint32_t n_ctx = cparams.n_ctx;
13682
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
13683
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
12604
13684
 
12605
13685
  size_t kv_buf_size;
12606
13686
  uint32_t kv_head;
@@ -12621,9 +13701,19 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
12621
13701
  ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
12622
13702
  inp += k_size;
12623
13703
 
13704
+ if (kv_self.recurrent) {
13705
+ // v is contiguous for recurrent models
13706
+ // TODO: use other tensors for state models than k and v
13707
+ const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
13708
+
13709
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, 0, v_size);
13710
+ inp += v_size;
13711
+ continue;
13712
+ }
13713
+
12624
13714
  // v is not contiguous, copy row by row
12625
13715
  const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12626
- const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
13716
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
12627
13717
 
12628
13718
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
12629
13719
  ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
@@ -12632,13 +13722,15 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
12632
13722
  }
12633
13723
  }
12634
13724
 
13725
+ GGML_ASSERT(kv_self.size == kv_size);
13726
+
12635
13727
  ctx->kv_self.head = kv_head;
12636
13728
  ctx->kv_self.size = kv_size;
12637
13729
  ctx->kv_self.used = kv_used;
12638
13730
 
12639
13731
  ctx->kv_self.cells.resize(kv_size);
12640
13732
 
12641
- for (uint32_t i = 0; i < kv_size; ++i) {
13733
+ for (uint32_t i = 0; i < kv_head; ++i) {
12642
13734
  llama_pos pos;
12643
13735
  size_t seq_id_size;
12644
13736
 
@@ -12654,6 +13746,11 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
12654
13746
  ctx->kv_self.cells[i].seq_id.insert(seq_id);
12655
13747
  }
12656
13748
  }
13749
+
13750
+ for (uint32_t i = kv_head; i < kv_size; ++i) {
13751
+ ctx->kv_self.cells[i].pos = -1;
13752
+ ctx->kv_self.cells[i].seq_id.clear();
13753
+ }
12657
13754
  }
12658
13755
 
12659
13756
  const size_t nread = inp - src;
@@ -12751,6 +13848,15 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
12751
13848
  ctx->cparams.n_threads_batch = n_threads_batch;
12752
13849
  }
12753
13850
 
13851
+ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
13852
+ ctx->abort_callback = abort_callback;
13853
+ ctx->abort_callback_data = abort_callback_data;
13854
+ }
13855
+
13856
+ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
13857
+ ctx->cparams.causal_attn = causal_attn;
13858
+ }
13859
+
12754
13860
  struct llama_batch llama_batch_get_one(
12755
13861
  llama_token * tokens,
12756
13862
  int32_t n_tokens,
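The new setter lets an application cancel a long llama_decode from another thread: the callback is polled during graph computation and returning true requests that it stop. A minimal sketch with an atomic flag; the names are illustrative:

// Sketch: cooperative cancellation via the new abort callback.
#include <atomic>
#include "llama.h"

static std::atomic<bool> g_cancel{false};

static bool abort_cb(void * /*data*/) {
    return g_cancel.load(); // true => abort the in-flight computation
}

static void enable_cancellation(llama_context * ctx) {
    llama_set_abort_callback(ctx, abort_cb, /*abort_callback_data=*/nullptr);
}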
@@ -12817,32 +13923,81 @@ int32_t llama_decode(
12817
13923
  return ret;
12818
13924
  }
12819
13925
 
13926
+ void llama_synchronize(struct llama_context * ctx) {
13927
+ ggml_backend_sched_synchronize(ctx->sched);
13928
+
13929
+ // FIXME: if multiple single tokens are evaluated without a synchronization,
13930
+ // the stats will be added to the prompt evaluation stats
13931
+ // this should only happen when using batch size 1 to evaluate a batch
13932
+
13933
+ // add the evaluation to the stats
13934
+ if (ctx->n_queued_tokens == 1) {
13935
+ ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
13936
+ ctx->n_eval++;
13937
+ } else if (ctx->n_queued_tokens > 1) {
13938
+ ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
13939
+ ctx->n_p_eval += ctx->n_queued_tokens;
13940
+ }
13941
+
13942
+ // get a more accurate load time, upon first eval
13943
+ if (ctx->n_queued_tokens > 0 && !ctx->has_evaluated_once) {
13944
+ ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
13945
+ ctx->has_evaluated_once = true;
13946
+ }
13947
+
13948
+ ctx->n_queued_tokens = 0;
13949
+ ctx->t_compute_start_us = 0;
13950
+ }
13951
+
12820
13952
  float * llama_get_logits(struct llama_context * ctx) {
12821
- return ctx->logits.data();
13953
+ llama_synchronize(ctx);
13954
+
13955
+ return ctx->logits;
12822
13956
  }
12823
13957
 
12824
13958
  float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
12825
13959
  assert(ctx->logits_valid.at(i));
12826
- return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
13960
+
13961
+ llama_synchronize(ctx);
13962
+
13963
+ return ctx->logits + i*ctx->model.hparams.n_vocab;
12827
13964
  }
12828
13965
 
12829
13966
  float * llama_get_embeddings(struct llama_context * ctx) {
12830
- return ctx->embedding.data();
13967
+ llama_synchronize(ctx);
13968
+
13969
+ return ctx->embd;
12831
13970
  }
12832
13971
 
12833
13972
  float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
12834
- return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
13973
+ llama_synchronize(ctx);
13974
+
13975
+ return ctx->embd + i*ctx->model.hparams.n_embd;
13976
+ }
13977
+
13978
+ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
13979
+ llama_synchronize(ctx);
13980
+
13981
+ auto it = ctx->embd_seq.find(seq_id);
13982
+ if (it == ctx->embd_seq.end()) {
13983
+ return nullptr;
13984
+ }
13985
+
13986
+ return it->second.data();
12835
13987
  }
12836
13988
 
12837
13989
  const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
13990
+ GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
12838
13991
  return model->vocab.id_to_token[token].text.c_str();
12839
13992
  }
12840
13993
 
12841
13994
  float llama_token_get_score(const struct llama_model * model, llama_token token) {
13995
+ GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
12842
13996
  return model->vocab.id_to_token[token].score;
12843
13997
  }
12844
13998
 
12845
13999
  llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
14000
+ GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
12846
14001
  return model->vocab.id_to_token[token].type;
12847
14002
  }
12848
14003
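llama_get_embeddings_seq returns the pooled embedding for a whole sequence, or nullptr when no pooled output exists (for instance when pooling_type is NONE). A hedged sketch of reading it after llama_decode; the helper is illustrative and assumes embeddings were enabled on the context:

// Sketch: print_seq_embedding is illustrative; it assumes a batch for `seq`
// has already been decoded with embeddings enabled.
#include <cstdio>
#include "llama.h"

static void print_seq_embedding(llama_context * ctx, const llama_model * model, llama_seq_id seq) {
    const float * emb = llama_get_embeddings_seq(ctx, seq);
    if (emb == nullptr) {
        std::printf("no pooled embedding for seq %d\n", seq);
        return;
    }
    std::printf("seq %d: n_embd = %d, first value = %f\n", seq, llama_n_embd(model), emb[0]);
}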
 
@@ -12887,12 +14042,12 @@ int32_t llama_tokenize(
12887
14042
  const char * text,
12888
14043
  int32_t text_len,
12889
14044
  llama_token * tokens,
12890
- int32_t n_max_tokens,
14045
+ int32_t n_tokens_max,
12891
14046
  bool add_bos,
12892
14047
  bool special) {
12893
14048
  auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
12894
14049
 
12895
- if (n_max_tokens < (int) res.size()) {
14050
+ if (n_tokens_max < (int) res.size()) {
12896
14051
  // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
12897
14052
  return -((int) res.size());
12898
14053
  }
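Only the parameter was renamed (n_max_tokens to n_tokens_max); the contract is unchanged: when the buffer is too small, the call returns the negated number of tokens required. A sketch of the usual two-pass pattern; the wrapper is illustrative:

// Sketch: tokenize() is an illustrative wrapper around the C API.
#include <string>
#include <vector>
#include "llama.h"

static std::vector<llama_token> tokenize(const llama_model * model, const std::string & text) {
    std::vector<llama_token> tokens(text.size() + 8); // rough upper bound
    int32_t n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                               tokens.data(), (int32_t) tokens.size(),
                               /*add_bos=*/true, /*special=*/false);
    if (n < 0) {
        tokens.resize((size_t) -n); // -n is the required token count
        n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                           tokens.data(), (int32_t) tokens.size(),
                           /*add_bos=*/true, /*special=*/false);
    }
    tokens.resize((size_t) n);
    return tokens;
}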
@@ -12906,9 +14061,9 @@ int32_t llama_tokenize(
12906
14061
 
12907
14062
  static std::string llama_decode_text(const std::string & text) {
12908
14063
  std::string decoded_text;
12909
- auto unicode_sequences = codepoints_from_utf8(text);
12910
- for (auto& unicode_sequence : unicode_sequences) {
12911
- decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence));
14064
+ auto unicode_sequences = unicode_cpts_from_utf8(text);
14065
+ for (auto & unicode_sequence : unicode_sequences) {
14066
+ decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));
12912
14067
  }
12913
14068
 
12914
14069
  return decoded_text;
@@ -12933,7 +14088,7 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
12933
14088
  } else if (llama_is_user_defined_token(model->vocab, token)) {
12934
14089
  std::string result = model->vocab.id_to_token[token].text;
12935
14090
  if (length < (int) result.length()) {
12936
- return -result.length();
14091
+ return -(int) result.length();
12937
14092
  }
12938
14093
  memcpy(buf, result.c_str(), result.length());
12939
14094
  return result.length();
@@ -12968,7 +14123,7 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
12968
14123
  } else if (llama_is_user_defined_token(model->vocab, token)) {
12969
14124
  std::string result = model->vocab.id_to_token[token].text;
12970
14125
  if (length < (int) result.length()) {
12971
- return -result.length();
14126
+ return -(int) result.length();
12972
14127
  }
12973
14128
  memcpy(buf, result.c_str(), result.length());
12974
14129
  return result.length();
@@ -13005,7 +14160,7 @@ static int32_t llama_chat_apply_template_internal(
13005
14160
  std::string & dest, bool add_ass) {
13006
14161
  // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
13007
14162
  std::stringstream ss;
13008
- if (tmpl.find("<|im_start|>") != std::string::npos) {
14163
+ if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) {
13009
14164
  // chatml template
13010
14165
  for (auto message : chat) {
13011
14166
  ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@@ -13013,7 +14168,7 @@ static int32_t llama_chat_apply_template_internal(
13013
14168
  if (add_ass) {
13014
14169
  ss << "<|im_start|>assistant\n";
13015
14170
  }
13016
- } else if (tmpl.find("[INST]") != std::string::npos) {
14171
+ } else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) {
13017
14172
  // llama2 template and its variants
13018
14173
  // [variant] support system message
13019
14174
  bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
@@ -13048,7 +14203,7 @@ static int32_t llama_chat_apply_template_internal(
13048
14203
  }
13049
14204
  }
13050
14205
  // llama2 templates seem to not care about "add_generation_prompt"
13051
- } else if (tmpl.find("<|user|>") != std::string::npos) {
14206
+ } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
13052
14207
  // zephyr template
13053
14208
  for (auto message : chat) {
13054
14209
  ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@@ -13056,7 +14211,7 @@ static int32_t llama_chat_apply_template_internal(
13056
14211
  if (add_ass) {
13057
14212
  ss << "<|assistant|>\n";
13058
14213
  }
13059
- } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
14214
+ } else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) {
13060
14215
  // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
13061
14216
  for (auto message : chat) {
13062
14217
  std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@@ -13065,7 +14220,7 @@ static int32_t llama_chat_apply_template_internal(
13065
14220
  if (add_ass) {
13066
14221
  ss << "<s>assistant\n";
13067
14222
  }
13068
- } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
14223
+ } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
13069
14224
  // google/gemma-7b-it
13070
14225
  std::string system_prompt = "";
13071
14226
  for (auto message : chat) {
@@ -13087,6 +14242,26 @@ static int32_t llama_chat_apply_template_internal(
13087
14242
  if (add_ass) {
13088
14243
  ss << "<start_of_turn>model\n";
13089
14244
  }
14245
+ } else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) {
14246
+ // OrionStarAI/Orion-14B-Chat
14247
+ std::string system_prompt = "";
14248
+ for (auto message : chat) {
14249
+ std::string role(message->role);
14250
+ if (role == "system") {
14251
+ // there is no system message support, we will merge it with user prompt
14252
+ system_prompt = message->content;
14253
+ continue;
14254
+ } else if (role == "user") {
14255
+ ss << "Human: ";
14256
+ if (!system_prompt.empty()) {
14257
+ ss << system_prompt << "\n\n";
14258
+ system_prompt = "";
14259
+ }
14260
+ ss << message->content << "\n\nAssistant: </s>";
14261
+ } else {
14262
+ ss << message->content << "</s>";
14263
+ }
14264
+ }
13090
14265
  } else {
13091
14266
  // template not supported
13092
14267
  return -1;
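Since the matcher now also accepts short names ("chatml", "llama2", "zephyr", "monarch", "gemma", "orion"), a caller can force a known format even when the model's GGUF metadata carries no template string. A hedged sketch using the public entry point shown in the next hunk; the wrapper name and buffer size are illustrative:

// Sketch: format_chat is illustrative; it forces the chatml template by name.
#include <string>
#include <vector>
#include "llama.h"

static std::string format_chat(const llama_model * model,
                               const std::vector<llama_chat_message> & msgs) {
    std::vector<char> buf(4096);
    int32_t n = llama_chat_apply_template(model, "chatml", msgs.data(), msgs.size(),
                                          /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (n < 0) {
        return std::string(); // template not supported
    }
    if (n > (int32_t) buf.size()) {
        buf.resize((size_t) n); // the return value is the required length
        n = llama_chat_apply_template(model, "chatml", msgs.data(), msgs.size(),
                                      true, buf.data(), (int32_t) buf.size());
    }
    return std::string(buf.data(), (size_t) n);
}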
@@ -13112,23 +14287,27 @@ LLAMA_API int32_t llama_chat_apply_template(
13112
14287
  int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
13113
14288
  if (res < 0) {
13114
14289
  // worst case: there is no information about template, we will use chatml by default
13115
- curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
14290
+ curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
13116
14291
  } else {
13117
14292
  curr_tmpl = std::string(model_template.data(), model_template.size());
13118
14293
  }
13119
14294
  }
14295
+
13120
14296
  // format the chat to string
13121
14297
  std::vector<const llama_chat_message *> chat_vec;
13122
14298
  chat_vec.resize(n_msg);
13123
14299
  for (size_t i = 0; i < n_msg; i++) {
13124
14300
  chat_vec[i] = &chat[i];
13125
14301
  }
14302
+
13126
14303
  std::string formatted_chat;
13127
14304
  int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
13128
14305
  if (res < 0) {
13129
14306
  return res;
13130
14307
  }
13131
- strncpy(buf, formatted_chat.c_str(), length);
14308
+ if (buf && length > 0) {
14309
+ strncpy(buf, formatted_chat.c_str(), length);
14310
+ }
13132
14311
  return res;
13133
14312
  }
13134
14313