llama_cpp 0.13.0 → 0.14.1

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -104,6 +104,7 @@
104
104
  #define LLAMA_MAX_NODES 8192
105
105
  #define LLAMA_MAX_EXPERTS 8
106
106
 
107
+
107
108
  //
108
109
  // logging
109
110
  //
@@ -211,10 +212,12 @@ enum llm_arch {
211
212
  LLM_ARCH_INTERNLM2,
212
213
  LLM_ARCH_MINICPM,
213
214
  LLM_ARCH_GEMMA,
215
+ LLM_ARCH_STARCODER2,
216
+ LLM_ARCH_MAMBA,
214
217
  LLM_ARCH_UNKNOWN,
215
218
  };
216
219
 
217
- static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
220
+ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
218
221
  { LLM_ARCH_LLAMA, "llama" },
219
222
  { LLM_ARCH_FALCON, "falcon" },
220
223
  { LLM_ARCH_GPT2, "gpt2" },
@@ -238,6 +241,9 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
238
241
  { LLM_ARCH_INTERNLM2, "internlm2" },
239
242
  { LLM_ARCH_MINICPM, "minicpm" },
240
243
  { LLM_ARCH_GEMMA, "gemma" },
244
+ { LLM_ARCH_STARCODER2, "starcoder2" },
245
+ { LLM_ARCH_MAMBA, "mamba" },
246
+ { LLM_ARCH_UNKNOWN, "(unknown)" },
241
247
  };
242
248
 
243
249
  enum llm_kv {
@@ -252,6 +258,7 @@ enum llm_kv {
252
258
  LLM_KV_GENERAL_SOURCE_URL,
253
259
  LLM_KV_GENERAL_SOURCE_HF_REPO,
254
260
 
261
+ LLM_KV_VOCAB_SIZE,
255
262
  LLM_KV_CONTEXT_LENGTH,
256
263
  LLM_KV_EMBEDDING_LENGTH,
257
264
  LLM_KV_BLOCK_COUNT,
@@ -280,6 +287,11 @@ enum llm_kv {
280
287
  LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
281
288
  LLM_KV_ROPE_SCALING_FINETUNED,
282
289
 
290
+ LLM_KV_SSM_INNER_SIZE,
291
+ LLM_KV_SSM_CONV_KERNEL,
292
+ LLM_KV_SSM_STATE_SIZE,
293
+ LLM_KV_SSM_TIME_STEP_RANK,
294
+
283
295
  LLM_KV_TOKENIZER_MODEL,
284
296
  LLM_KV_TOKENIZER_LIST,
285
297
  LLM_KV_TOKENIZER_TOKEN_TYPE,
@@ -298,7 +310,7 @@ enum llm_kv {
298
310
  LLM_KV_TOKENIZER_RWKV,
299
311
  };
300
312
 
301
- static std::map<llm_kv, const char *> LLM_KV_NAMES = {
313
+ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
302
314
  { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
303
315
  { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
304
316
  { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
@@ -310,6 +322,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
310
322
  { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
311
323
  { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
312
324
 
325
+ { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
313
326
  { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
314
327
  { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
315
328
  { LLM_KV_BLOCK_COUNT, "%s.block_count" },
@@ -338,6 +351,11 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
338
351
  { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
339
352
  { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
340
353
 
354
+ { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
355
+ { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
356
+ { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
357
+ { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
358
+
341
359
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
342
360
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
343
361
  { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
@@ -362,7 +380,7 @@ struct LLM_KV {
362
380
  llm_arch arch;
363
381
 
364
382
  std::string operator()(llm_kv kv) const {
365
- return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
383
+ return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
366
384
  }
367
385
  };
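
The switch to const maps together with .at() in the lookup above matters because operator[] is a non-const member that inserts a default-constructed value for a missing key, while .at() works on a const map and throws on an unknown key instead of silently growing the table. A minimal standalone sketch of the difference (illustrative code, not part of the package):

    #include <cassert>
    #include <map>
    #include <stdexcept>
    #include <string>

    int main() {
        const std::map<int, const char *> names = { { 0, "llama" }, { 1, "falcon" } };

        // names[2] would not compile here: operator[] is non-const because it may insert.
        assert(std::string(names.at(0)) == "llama");   // .at() works on a const map

        try {
            names.at(42);                              // unknown key: throws instead of inserting
        } catch (const std::out_of_range &) {
            // a missing key is now a loud error rather than a new map entry
        }
        return 0;
    }
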
368
386
 
@@ -395,9 +413,16 @@ enum llm_tensor {
395
413
  LLM_TENSOR_ATTN_Q_NORM,
396
414
  LLM_TENSOR_ATTN_K_NORM,
397
415
  LLM_TENSOR_LAYER_OUT_NORM,
416
+ LLM_TENSOR_SSM_IN,
417
+ LLM_TENSOR_SSM_CONV1D,
418
+ LLM_TENSOR_SSM_X,
419
+ LLM_TENSOR_SSM_DT,
420
+ LLM_TENSOR_SSM_A,
421
+ LLM_TENSOR_SSM_D,
422
+ LLM_TENSOR_SSM_OUT,
398
423
  };
399
424
 
400
- static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
425
+ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
401
426
  {
402
427
  LLM_ARCH_LLAMA,
403
428
  {
@@ -779,6 +804,40 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
779
804
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
780
805
  },
781
806
  },
807
+ {
808
+ LLM_ARCH_STARCODER2,
809
+ {
810
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
811
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
812
+ { LLM_TENSOR_OUTPUT, "output" },
813
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
814
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
815
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
816
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
817
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
818
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
819
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
820
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
821
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
822
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
823
+ },
824
+ },
825
+ {
826
+ LLM_ARCH_MAMBA,
827
+ {
828
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
829
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
830
+ { LLM_TENSOR_OUTPUT, "output" },
831
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
832
+ { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
833
+ { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
834
+ { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" },
835
+ { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
836
+ { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
837
+ { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
838
+ { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
839
+ },
840
+ },
782
841
  {
783
842
  LLM_ARCH_UNKNOWN,
784
843
  {
@@ -812,38 +871,38 @@ struct LLM_TN {
812
871
  llm_arch arch;
813
872
 
814
873
  std::string operator()(llm_tensor tensor) const {
815
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
874
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
816
875
  return "__missing__";
817
876
  }
818
- return LLM_TENSOR_NAMES[arch].at(tensor);
877
+ return LLM_TENSOR_NAMES.at(arch).at(tensor);
819
878
  }
820
879
 
821
880
  std::string operator()(llm_tensor tensor, const std::string & suffix) const {
822
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
881
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
823
882
  return "__missing__";
824
883
  }
825
- return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
884
+ return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix;
826
885
  }
827
886
 
828
887
  std::string operator()(llm_tensor tensor, int bid) const {
829
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
888
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
830
889
  return "__missing__";
831
890
  }
832
- return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
891
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid);
833
892
  }
834
893
 
835
894
  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
836
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
895
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
837
896
  return "__missing__";
838
897
  }
839
- return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
898
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix;
840
899
  }
841
900
 
842
901
  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
843
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
902
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
844
903
  return "__missing__";
845
904
  }
846
- return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
905
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
847
906
  }
848
907
  };
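
The LLM_KV and LLM_TN tables above store printf-style patterns ("%s.ssm.conv_kernel", "blk.%d.ssm_in"), so the concrete GGUF key or tensor name is produced by substituting the architecture name or block index and, for tensors, appending a suffix. A rough sketch of that expansion using std::snprintf in place of the library's internal ::format helper (stand-in names, not the package's own code):

    #include <cstdio>
    #include <string>

    // illustrative stand-in for the library's ::format helper
    static std::string format_name(const char * fmt, const char * arch) {
        char buf[256];
        std::snprintf(buf, sizeof(buf), fmt, arch);
        return buf;
    }

    static std::string format_blk(const char * fmt, int bid, const char * suffix) {
        char buf[256];
        std::snprintf(buf, sizeof(buf), fmt, bid);
        return std::string(buf) + "." + suffix;
    }

    int main() {
        // "%s.ssm.conv_kernel" with arch "mamba"  -> "mamba.ssm.conv_kernel"
        std::printf("%s\n", format_name("%s.ssm.conv_kernel", "mamba").c_str());
        // "blk.%d.ssm_in" with block 0 + "weight" -> "blk.0.ssm_in.weight"
        std::printf("%s\n", format_blk("blk.%d.ssm_in", 0, "weight").c_str());
        return 0;
    }
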
849
908
 
@@ -851,16 +910,16 @@ struct LLM_TN {
851
910
  // gguf helpers
852
911
  //
853
912
 
854
- static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
913
+ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
855
914
  { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
856
915
  { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
857
916
  { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
858
917
  };
859
918
 
860
- static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
919
+ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
861
920
  for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
862
921
  if (kv.second == name) {
863
- return kv.first;
922
+ return (llama_rope_scaling_type) kv.first;
864
923
  }
865
924
  }
866
925
 
@@ -921,21 +980,6 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
921
980
  }
922
981
  }
923
982
 
924
- //
925
- // ggml helpers
926
- //
927
-
928
- static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
929
- struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
930
-
931
- if (plan.work_size > 0) {
932
- buf.resize(plan.work_size);
933
- plan.work_data = buf.data();
934
- }
935
-
936
- ggml_graph_compute(graph, &plan);
937
- }
938
-
939
983
  //
940
984
  // llama helpers
941
985
  //
@@ -1409,7 +1453,9 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
1409
1453
  buft = ggml_backend_cuda_host_buffer_type();
1410
1454
  }
1411
1455
  #elif defined(GGML_USE_SYCL)
1412
- buft = ggml_backend_sycl_host_buffer_type();
1456
+ if (host_buffer) {
1457
+ buft = ggml_backend_sycl_host_buffer_type();
1458
+ }
1413
1459
  #elif defined(GGML_USE_CPU_HBM)
1414
1460
  buft = ggml_backend_cpu_hbm_buffer_type();
1415
1461
  #elif defined(GGML_USE_VULKAN)
@@ -1463,6 +1509,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
1463
1509
  }
1464
1510
  #endif
1465
1511
 
1512
+ #ifdef GGML_USE_SYCL
1513
+ if (ggml_backend_sycl_get_device_count() > 1) {
1514
+ buft = ggml_backend_sycl_split_buffer_type(tensor_split);
1515
+ }
1516
+ #endif
1517
+
1466
1518
  if (buft == nullptr) {
1467
1519
  buft = llama_default_buffer_type_offload(fallback_gpu);
1468
1520
  }
@@ -1474,6 +1526,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
1474
1526
  static size_t llama_get_device_count() {
1475
1527
  #if defined(GGML_USE_CUBLAS)
1476
1528
  return ggml_backend_cuda_get_device_count();
1529
+ #elif defined(GGML_USE_SYCL)
1530
+ return ggml_backend_sycl_get_device_count();
1477
1531
  #elif defined(GGML_USE_VULKAN)
1478
1532
  return ggml_backend_vk_get_device_count();
1479
1533
  #else
@@ -1487,6 +1541,11 @@ static size_t llama_get_device_memory(int device) {
1487
1541
  size_t free;
1488
1542
  ggml_backend_cuda_get_device_memory(device, &total, &free);
1489
1543
  return free;
1544
+ #elif defined(GGML_USE_SYCL)
1545
+ size_t total;
1546
+ size_t free;
1547
+ ggml_backend_sycl_get_device_memory(device, &total, &free);
1548
+ return free;
1490
1549
  #elif defined(GGML_USE_VULKAN)
1491
1550
  size_t total;
1492
1551
  size_t free;
@@ -1575,7 +1634,12 @@ struct llama_hparams {
1575
1634
  float rope_freq_base_train;
1576
1635
  float rope_freq_scale_train;
1577
1636
  uint32_t n_yarn_orig_ctx;
1578
- int32_t rope_scaling_type_train;
1637
+
1638
+ // for State Space Models
1639
+ uint32_t ssm_d_conv = 0;
1640
+ uint32_t ssm_d_inner = 0;
1641
+ uint32_t ssm_d_state = 0;
1642
+ uint32_t ssm_dt_rank = 0;
1579
1643
 
1580
1644
  float f_clamp_kqv = 0.0f;
1581
1645
  float f_max_alibi_bias = 0.0f;
@@ -1583,8 +1647,9 @@ struct llama_hparams {
1583
1647
  bool causal_attn = true;
1584
1648
  bool need_kq_pos = false;
1585
1649
 
1586
- enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
1587
- enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
1650
+ enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
1651
+ enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
1652
+ enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
1588
1653
 
1589
1654
  bool operator!=(const llama_hparams & other) const {
1590
1655
  if (this->vocab_only != other.vocab_only) return true;
@@ -1604,6 +1669,11 @@ struct llama_hparams {
1604
1669
  if (this->rope_finetuned != other.rope_finetuned) return true;
1605
1670
  if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
1606
1671
 
1672
+ if (this->ssm_d_conv != other.ssm_d_conv) return true;
1673
+ if (this->ssm_d_inner != other.ssm_d_inner) return true;
1674
+ if (this->ssm_d_state != other.ssm_d_state) return true;
1675
+ if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
1676
+
1607
1677
  const float EPSILON = 1e-9f;
1608
1678
 
1609
1679
  if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
@@ -1615,6 +1685,9 @@ struct llama_hparams {
1615
1685
  }
1616
1686
 
1617
1687
  uint32_t n_gqa() const {
1688
+ if (n_head_kv == 0) {
1689
+ return 0;
1690
+ }
1618
1691
  return n_head/n_head_kv;
1619
1692
  }
1620
1693
 
@@ -1625,16 +1698,29 @@ struct llama_hparams {
1625
1698
  uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads
1626
1699
  return n_embd_head_v * n_head_kv;
1627
1700
  }
1701
+
1702
+ uint32_t n_embd_k_s() const { // dimension of the rolling state embeddings
1703
+ // corresponds to Mamba's conv_states size
1704
+ // TODO: maybe support other convolution strides than 1
1705
+ // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
1706
+ return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
1707
+ }
1708
+
1709
+ uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
1710
+ // corresponds to Mamba's ssm_states size
1711
+ return ssm_d_state * ssm_d_inner;
1712
+ }
1628
1713
  };
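
For Mamba layers the "K" and "V" slots of the cache hold the rolling convolution state and the SSM state rather than attention keys and values, so the per-cell sizes added above come from the SSM hyperparameters instead of head counts. A small sketch of the arithmetic; the concrete numbers are illustrative only, roughly matching a d_model=2560 Mamba with the common d_conv=4, d_state=16 and the expansion factor of 2 that the loader later asserts:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // illustrative hyperparameters, not read from any model file
        const uint32_t ssm_d_conv  = 4;            // conv kernel size
        const uint32_t ssm_d_inner = 2 * 2560;     // inner size (expansion factor 2)
        const uint32_t ssm_d_state = 16;           // SSM state size

        // mirrors n_embd_k_s(): the first conv column is shifted out, so d_conv - 1 columns are kept
        const uint32_t n_embd_k_s = (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
        // mirrors n_embd_v_s(): one d_state-sized state per inner channel
        const uint32_t n_embd_v_s = ssm_d_state * ssm_d_inner;

        std::printf("conv state per cell: %u floats\n", n_embd_k_s);   // 3 * 5120 = 15360
        std::printf("ssm  state per cell: %u floats\n", n_embd_v_s);   // 16 * 5120 = 81920
        return 0;
    }
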
1629
1714
 
1630
1715
  struct llama_cparams {
1631
- uint32_t n_ctx; // context size used during inference
1716
+ uint32_t n_ctx; // context size used during inference
1632
1717
  uint32_t n_batch;
1718
+ uint32_t n_ubatch;
1633
1719
  uint32_t n_threads; // number of threads to use for generation
1634
1720
  uint32_t n_threads_batch; // number of threads to use for batch processing
1635
1721
 
1636
- float rope_freq_base;
1637
- float rope_freq_scale;
1722
+ float rope_freq_base;
1723
+ float rope_freq_scale;
1638
1724
 
1639
1725
  uint32_t n_yarn_orig_ctx;
1640
1726
  // These hyperparameters are not exposed in GGUF, because all
@@ -1645,8 +1731,11 @@ struct llama_cparams {
1645
1731
  float yarn_beta_slow;
1646
1732
  float defrag_thold;
1647
1733
 
1734
+ bool embeddings;
1735
+ bool causal_attn;
1648
1736
  bool offload_kqv;
1649
- bool do_pooling;
1737
+
1738
+ enum llama_pooling_type pooling_type;
1650
1739
 
1651
1740
  ggml_backend_sched_eval_callback cb_eval;
1652
1741
  void * cb_eval_user_data;
@@ -1700,11 +1789,27 @@ struct llama_layer {
1700
1789
  struct ggml_tensor * ffn_down_b; // b2
1701
1790
  struct ggml_tensor * ffn_up_b; // b3
1702
1791
  struct ggml_tensor * ffn_act;
1792
+
1793
+ // mamba proj
1794
+ struct ggml_tensor * ssm_in;
1795
+ struct ggml_tensor * ssm_x;
1796
+ struct ggml_tensor * ssm_dt;
1797
+ struct ggml_tensor * ssm_out;
1798
+
1799
+ // mamba
1800
+ struct ggml_tensor * ssm_conv1d;
1801
+ struct ggml_tensor * ssm_a;
1802
+ struct ggml_tensor * ssm_d;
1803
+
1804
+ // mamba bias
1805
+ struct ggml_tensor * ssm_conv1d_b;
1806
+ struct ggml_tensor * ssm_dt_b;
1703
1807
  };
1704
1808
 
1705
1809
  struct llama_kv_cell {
1706
1810
  llama_pos pos = -1;
1707
1811
  llama_pos delta = 0;
1812
+ int32_t src = 0; // used by recurrent state models to copy states
1708
1813
 
1709
1814
  std::set<llama_seq_id> seq_id;
1710
1815
 
@@ -1725,6 +1830,9 @@ struct llama_kv_cell {
1725
1830
  struct llama_kv_cache {
1726
1831
  bool has_shift = false;
1727
1832
  bool do_defrag = false;
1833
+ bool do_copy = false;
1834
+ // with recurrent state models, a cell can hold the state for more than one past token
1835
+ bool recurrent = false;
1728
1836
 
1729
1837
  // Note: The value of head isn't only used to optimize searching
1730
1838
  // for a free KV slot. llama_decode_internal also uses it, so it
@@ -1904,8 +2012,7 @@ struct llama_context {
1904
2012
  ggml_vk_free_cpu_assist();
1905
2013
  #endif
1906
2014
 
1907
- ggml_backend_buffer_free(buf_input);
1908
- ggml_free(ctx_input);
2015
+ ggml_backend_buffer_free(buf_output);
1909
2016
  }
1910
2017
 
1911
2018
  llama_cparams cparams;
@@ -1931,36 +2038,54 @@ struct llama_context {
1931
2038
  int64_t t_p_eval_us = 0;
1932
2039
  int64_t t_eval_us = 0;
1933
2040
 
2041
+ int64_t t_compute_start_us = 0;
2042
+ int64_t n_queued_tokens = 0;
2043
+
1934
2044
  int32_t n_sample = 0; // number of tokens sampled
1935
2045
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
1936
2046
  int32_t n_eval = 0; // number of eval calls
1937
2047
 
2048
+ // host buffer for the model output (logits and embeddings)
2049
+ ggml_backend_buffer_t buf_output = nullptr;
2050
+
1938
2051
  // decode output (2-dimensional array: [n_tokens][n_vocab])
1939
- std::vector<float> logits;
2052
+ size_t logits_size = 0;
2053
+ float * logits = nullptr;
2054
+
1940
2055
  #ifndef NDEBUG
1941
2056
  // guard against access to unset logits
1942
2057
  std::vector<bool> logits_valid;
1943
2058
  #endif
1944
2059
  bool logits_all = false;
1945
2060
 
1946
- // input embedding (1-dimensional array: [n_embd])
1947
- std::vector<float> embedding;
2061
+ // embeddings output (2-dimensional array: [n_tokens][n_embd])
2062
+ // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
2063
+ size_t embd_size = 0;
2064
+ float * embd = nullptr;
2065
+
2066
+ // sequence embeddings output (map of [n_embd] vectors)
2067
+ // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
2068
+ std::map<llama_seq_id, std::vector<float>> embd_seq;
1948
2069
 
1949
2070
  // memory buffers used to evaluate the model
1950
2071
  std::vector<uint8_t> buf_compute_meta;
1951
2072
  ggml_backend_sched_t sched = nullptr;
1952
2073
 
2074
+ ggml_abort_callback abort_callback = nullptr;
2075
+ void * abort_callback_data = nullptr;
2076
+
1953
2077
  // input tensors
1954
- ggml_backend_buffer_t buf_input = nullptr;
1955
- ggml_context * ctx_input = nullptr;
1956
2078
  struct ggml_tensor * inp_tokens; // I32 [n_batch]
1957
2079
  struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
1958
2080
  struct ggml_tensor * inp_pos; // I32 [n_batch]
1959
- struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
1960
- struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx]
1961
- struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
2081
+ struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
2082
+ struct ggml_tensor * inp_KQ_pos; // F32 [kv_size]
2083
+ struct ggml_tensor * inp_K_shift; // I32 [kv_size]
1962
2084
  struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
1963
2085
  struct ggml_tensor * inp_cls; // I32 [n_batch]
2086
+ struct ggml_tensor * inp_s_copy; // I32 [kv_size]
2087
+ struct ggml_tensor * inp_s_mask; // F32 [1, kv_size]
2088
+ struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch]
1964
2089
 
1965
2090
  #ifdef GGML_USE_MPI
1966
2091
  ggml_mpi_context * ctx_mpi = NULL;
@@ -1976,25 +2101,42 @@ static bool llama_kv_cache_init(
1976
2101
  const llama_model & model,
1977
2102
  ggml_type type_k,
1978
2103
  ggml_type type_v,
1979
- uint32_t n_ctx,
2104
+ uint32_t kv_size,
1980
2105
  bool offload) {
1981
2106
  const struct llama_hparams & hparams = model.hparams;
1982
2107
 
1983
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
1984
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
2108
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
2109
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
1985
2110
  const int64_t n_layer = hparams.n_layer;
1986
2111
 
1987
2112
  cache.has_shift = false;
1988
2113
 
2114
+ // TODO: find a nicer way to add other recurrent model architectures
2115
+ cache.recurrent = model.arch == LLM_ARCH_MAMBA;
2116
+
2117
+ // TODO: support mixed recurrent Transformer architectures
2118
+ // NOTE: (!a || b) is a logical implication (a -> b)
2119
+ GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
2120
+ GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());
2121
+ GGML_ASSERT( cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_gqa());
2122
+ GGML_ASSERT( cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_gqa());
2123
+
1989
2124
  cache.head = 0;
1990
- cache.size = n_ctx;
2125
+ cache.size = kv_size;
1991
2126
  cache.used = 0;
1992
2127
 
1993
2128
  cache.type_k = type_k;
1994
2129
  cache.type_v = type_v;
1995
2130
 
1996
2131
  cache.cells.clear();
1997
- cache.cells.resize(n_ctx);
2132
+ cache.cells.resize(kv_size);
2133
+
2134
+ if (cache.recurrent) {
2135
+ // init state copy sources
2136
+ for (uint32_t i = 0; i < cache.size; ++i) {
2137
+ cache.cells[i].src = i;
2138
+ }
2139
+ }
1998
2140
 
1999
2141
  #ifdef GGML_USE_CLBLAST
2000
2142
  offload = false;
@@ -2033,8 +2175,8 @@ static bool llama_kv_cache_init(
2033
2175
 
2034
2176
  for (int i = 0; i < (int) n_layer; i++) {
2035
2177
  struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
2036
- ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*n_ctx);
2037
- ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*n_ctx);
2178
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
2179
+ ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
2038
2180
  ggml_format_name(k, "cache_k_l%d", i);
2039
2181
  ggml_format_name(v, "cache_v_l%d", i);
2040
2182
  cache.k_l.push_back(k);
@@ -2068,6 +2210,54 @@ static bool llama_kv_cache_find_slot(
2068
2210
  const uint32_t n_ctx = cache.size;
2069
2211
  const uint32_t n_tokens = batch.n_tokens;
2070
2212
 
2213
+ if (cache.recurrent) {
2214
+ // For recurrent state architectures (like Mamba),
2215
+ // each KV cache cell can store the state for a whole sequence.
2216
+
2217
+ llama_seq_id min = cache.size - 1;
2218
+ llama_seq_id max = 0;
2219
+
2220
+ for (uint32_t i = 0; i < n_tokens; ++i) {
2221
+ for (int32_t j = 0; j < batch.n_seq_id[i]; ++j) {
2222
+ llama_seq_id seq_id = batch.seq_id[i][j];
2223
+ // make sure it's a valid seq_id
2224
+ if ((uint32_t) seq_id < cache.size) {
2225
+ if (seq_id > max) {
2226
+ max = seq_id;
2227
+ }
2228
+ if (seq_id < min) {
2229
+ min = seq_id;
2230
+ }
2231
+ // Assuming the tokens are in-order
2232
+ if (batch.pos[i] != cache.cells[seq_id].pos + 1) {
2233
+ // What should happen when the pos backtracks or skips a value?
2234
+ // Clearing the state mid-batch would require special-casing which isn't done.
2235
+ LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d\n",
2236
+ __func__, batch.pos[i], cache.cells[seq_id].pos, seq_id);
2237
+ }
2238
+ if (cache.cells[seq_id].pos < 0 && 0 <= batch.pos[i]) {
2239
+ cache.used += 1;
2240
+ }
2241
+ cache.cells[seq_id].pos = batch.pos[i];
2242
+ // NOTE: seq_ids are not inserted here; they are handled when the input tensors are set
2243
+ } else {
2244
+ // too big seq_id
2245
+ // TODO: would it be possible to resize the KV cache size instead?
2246
+ LLAMA_LOG_ERROR("%s: seq_id=%d >= kv_size=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size);
2247
+ return false;
2248
+ }
2249
+ }
2250
+ }
2251
+
2252
+ // allow getting the range of used cells, from head to head + n
2253
+ cache.head = min;
2254
+ cache.n = max - min + 1;
2255
+
2256
+ // sanity check
2257
+ return max >= min;
2258
+ }
2259
+ // otherwise, one cell per token.
2260
+
2071
2261
  if (n_tokens > n_ctx) {
2072
2262
  LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
2073
2263
  return false;
@@ -2116,10 +2306,12 @@ static bool llama_kv_cache_find_slot(
2116
2306
  }
2117
2307
 
2118
2308
  // find how many cells are currently in use
2119
- static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
2120
- for (uint32_t i = cache.size - 1; i > 0; --i) {
2121
- if (cache.cells[i].pos >= 0 && !cache.cells[i].is_empty()) {
2122
- return i + 1;
2309
+ static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
2310
+ for (uint32_t i = cache.size; i > 0; --i) {
2311
+ const llama_kv_cell & cell = cache.cells[i - 1];
2312
+
2313
+ if (cell.pos >= 0 && !cell.is_empty()) {
2314
+ return i;
2123
2315
  }
2124
2316
  }
2125
2317
 
@@ -2135,7 +2327,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
2135
2327
  cache.used = 0;
2136
2328
  }
2137
2329
 
2138
- static void llama_kv_cache_seq_rm(
2330
+ static bool llama_kv_cache_seq_rm(
2139
2331
  struct llama_kv_cache & cache,
2140
2332
  llama_seq_id seq_id,
2141
2333
  llama_pos p0,
@@ -2145,6 +2337,25 @@ static void llama_kv_cache_seq_rm(
2145
2337
  if (p0 < 0) p0 = 0;
2146
2338
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
2147
2339
 
2340
+ // models like Mamba can't have a state partially erased
2341
+ if (cache.recurrent) {
2342
+ if (seq_id >= (int64_t) cache.size) {
2343
+ // could be fatal
2344
+ return false;
2345
+ }
2346
+ if (0 <= seq_id) {
2347
+ // partial intersection is invalid
2348
+ if ((0 < p0 && p0 <= cache.cells[seq_id].pos) || (0 < p1 && p1 <= cache.cells[seq_id].pos)) {
2349
+ return false;
2350
+ }
2351
+ } else {
2352
+ // seq_id is negative, then the range should include everything or nothing
2353
+ if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
2354
+ return false;
2355
+ }
2356
+ }
2357
+ }
2358
+
2148
2359
  for (uint32_t i = 0; i < cache.size; ++i) {
2149
2360
  if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
2150
2361
  if (seq_id < 0) {
@@ -2166,6 +2377,8 @@ static void llama_kv_cache_seq_rm(
2166
2377
 
2167
2378
  // If we freed up a slot, set head to it so searching can start there.
2168
2379
  if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
2380
+
2381
+ return true;
2169
2382
  }
2170
2383
 
2171
2384
  static void llama_kv_cache_seq_cp(
@@ -2177,6 +2390,29 @@ static void llama_kv_cache_seq_cp(
2177
2390
  if (p0 < 0) p0 = 0;
2178
2391
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
2179
2392
 
2393
+ if (cache.recurrent) {
2394
+ if ((uint32_t) seq_id_dst < cache.size && (uint32_t) seq_id_src < cache.size) {
2395
+ seq_id_src = cache.cells[seq_id_src].src;
2396
+ GGML_ASSERT((uint32_t) seq_id_src < cache.size);
2397
+ // intent to "copy from"
2398
+ // supports copy chains thanks to taking the source of the source
2399
+ cache.cells[seq_id_dst].src = seq_id_src;
2400
+
2401
+ // preserve the "keep or clear" status of the copied sequence
2402
+ if (cache.cells[seq_id_src].has_seq_id(seq_id_src)) {
2403
+ cache.cells[seq_id_dst].seq_id.insert(seq_id_dst);
2404
+ } else {
2405
+ cache.cells[seq_id_dst].seq_id.erase(seq_id_dst);
2406
+ }
2407
+
2408
+ cache.do_copy = true;
2409
+
2410
+ cache.cells[seq_id_dst].pos = cache.cells[seq_id_src].pos;
2411
+ }
2412
+ return;
2413
+ }
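
Copying a sequence in the recurrent cache, as shown above, moves no state data immediately: it only records the source cell in src (following the source's own src, so chains of copies resolve to the original cell) and sets do_copy; the rows are gathered later by the build_s_copy graph. A tiny sketch of how the chained src bookkeeping behaves (illustrative, outside the package):

    #include <cstdio>
    #include <vector>

    int main() {
        // src[i] == i means "cell i holds its own state"
        std::vector<int> src = { 0, 1, 2, 3 };

        // "copy seq 0 into seq 2", then "copy seq 2 into seq 3":
        src[2] = src[0];          // -> 0
        src[3] = src[2];          // follows the source of the source -> 0, not 2

        for (size_t i = 0; i < src.size(); ++i) {
            std::printf("cell %zu copies from cell %d\n", i, src[i]);
        }
        return 0;
    }
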
2414
+ // otherwise, this is the KV cache of a Transformer-like model
2415
+
2180
2416
  cache.head = 0;
2181
2417
 
2182
2418
  for (uint32_t i = 0; i < cache.size; ++i) {
@@ -2216,6 +2452,17 @@ static void llama_kv_cache_seq_add(
2216
2452
  if (p0 < 0) p0 = 0;
2217
2453
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
2218
2454
 
2455
+ if (cache.recurrent) {
2456
+ // for Mamba-like models, only the pos needs to be shifted
2457
+ if (0 <= seq_id && seq_id < (int64_t) cache.size) {
2458
+ llama_kv_cell & cell = cache.cells[seq_id];
2459
+ if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
2460
+ cell.pos += delta;
2461
+ }
2462
+ }
2463
+ return;
2464
+ }
2465
+
2219
2466
  for (uint32_t i = 0; i < cache.size; ++i) {
2220
2467
  if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
2221
2468
  cache.has_shift = true;
@@ -2249,6 +2496,17 @@ static void llama_kv_cache_seq_div(
2249
2496
  if (p0 < 0) p0 = 0;
2250
2497
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
2251
2498
 
2499
+ if (cache.recurrent) {
2500
+ // for Mamba-like models, only the pos needs to be changed
2501
+ if (0 <= seq_id && seq_id < (int64_t) cache.size) {
2502
+ llama_kv_cell & cell = cache.cells[seq_id];
2503
+ if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
2504
+ cell.pos /= d;
2505
+ }
2506
+ }
2507
+ return;
2508
+ }
2509
+
2252
2510
  for (uint32_t i = 0; i < cache.size; ++i) {
2253
2511
  if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
2254
2512
  cache.has_shift = true;
@@ -2891,7 +3149,11 @@ template<>
2891
3149
  bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
2892
3150
  uint32_t tmp;
2893
3151
  const bool found = get_key(kid, tmp, required);
2894
- result = (enum llama_pooling_type) tmp;
3152
+ if (found) {
3153
+ result = (enum llama_pooling_type) tmp;
3154
+ } else {
3155
+ result = LLAMA_POOLING_TYPE_UNSPECIFIED;
3156
+ }
2895
3157
  return found;
2896
3158
  }
2897
3159
 
@@ -2982,10 +3244,11 @@ static const char * llama_model_type_name(e_model type) {
2982
3244
 
2983
3245
  static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
2984
3246
  switch (type) {
2985
- case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2986
- case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2987
- case LLAMA_VOCAB_TYPE_WPM: return "WPM";
2988
- default: return "unknown";
3247
+ case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
3248
+ case LLAMA_VOCAB_TYPE_SPM: return "SPM";
3249
+ case LLAMA_VOCAB_TYPE_BPE: return "BPE";
3250
+ case LLAMA_VOCAB_TYPE_WPM: return "WPM";
3251
+ default: return "unknown";
2989
3252
  }
2990
3253
  }
2991
3254
 
@@ -3017,14 +3280,14 @@ static void llm_load_hparams(
3017
3280
  ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
3018
3281
 
3019
3282
  // get hparams kv
3020
- ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
3021
- ml.get_key (LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
3022
- ml.get_key (LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
3023
- ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
3024
- ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
3025
- ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer);
3026
- ml.get_key (LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
3027
- ml.get_key (LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
3283
+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
3284
+ ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
3285
+ ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
3286
+ ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
3287
+ ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
3288
+ ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
3289
+ ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
3290
+ ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
3028
3291
 
3029
3292
  GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
3030
3293
  GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
@@ -3064,7 +3327,7 @@ static void llm_load_hparams(
3064
3327
 
3065
3328
  // sanity check for n_rot (optional)
3066
3329
  {
3067
- hparams.n_rot = hparams.n_embd / hparams.n_head;
3330
+ hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
3068
3331
 
3069
3332
  ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
3070
3333
 
@@ -3077,10 +3340,10 @@ static void llm_load_hparams(
3077
3340
  // gpt-j n_rot = rotary_dim
3078
3341
  }
3079
3342
 
3080
- hparams.n_embd_head_k = hparams.n_embd / hparams.n_head;
3343
+ hparams.n_embd_head_k = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
3081
3344
  ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
3082
3345
 
3083
- hparams.n_embd_head_v = hparams.n_embd / hparams.n_head;
3346
+ hparams.n_embd_head_v = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
3084
3347
  ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
3085
3348
 
3086
3349
  // arch-specific KVs
@@ -3168,7 +3431,7 @@ static void llm_load_hparams(
3168
3431
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3169
3432
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3170
3433
  ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
3171
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3434
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
3172
3435
 
3173
3436
  switch (hparams.n_layer) {
3174
3437
  case 3:
@@ -3320,6 +3583,46 @@ static void llm_load_hparams(
3320
3583
  default: model.type = e_model::MODEL_UNKNOWN;
3321
3584
  }
3322
3585
  } break;
3586
+ case LLM_ARCH_STARCODER2:
3587
+ {
3588
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3589
+ switch (hparams.n_layer) {
3590
+ case 30: model.type = e_model::MODEL_3B; break;
3591
+ case 32: model.type = e_model::MODEL_7B; break;
3592
+ case 40: model.type = e_model::MODEL_15B; break;
3593
+ default: model.type = e_model::MODEL_UNKNOWN;
3594
+ }
3595
+ } break;
3596
+ case LLM_ARCH_MAMBA:
3597
+ {
3598
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
3599
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
3600
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
3601
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
3602
+
3603
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
3604
+
3605
+ switch (hparams.n_layer) {
3606
+ case 24:
3607
+ switch (hparams.n_embd) {
3608
+ case 768: model.type = e_model::MODEL_SMALL; break;
3609
+ default: model.type = e_model::MODEL_UNKNOWN;
3610
+ } break;
3611
+ case 48:
3612
+ switch (hparams.n_embd) {
3613
+ case 1024: model.type = e_model::MODEL_MEDIUM; break;
3614
+ case 1536: model.type = e_model::MODEL_LARGE; break;
3615
+ case 2048: model.type = e_model::MODEL_XL; break;
3616
+ default: model.type = e_model::MODEL_UNKNOWN;
3617
+ } break;
3618
+ case 64:
3619
+ switch (hparams.n_embd) {
3620
+ case 2560: model.type = e_model::MODEL_3B; break;
3621
+ default: model.type = e_model::MODEL_UNKNOWN;
3622
+ } break;
3623
+ default: model.type = e_model::MODEL_UNKNOWN;
3624
+ }
3625
+ } break;
3323
3626
  default: (void)0;
3324
3627
  }
3325
3628
 
@@ -3345,30 +3648,25 @@ static void llm_load_vocab(
3345
3648
 
3346
3649
  const auto kv = LLM_KV(model.arch);
3347
3650
 
3348
- const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
3349
- if (token_idx == -1) {
3350
- throw std::runtime_error("cannot find tokenizer vocab in model file\n");
3351
- }
3352
-
3353
- const float * scores = nullptr;
3354
- const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
3355
- if (score_idx != -1) {
3356
- scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
3357
- }
3358
-
3359
- const int * toktypes = nullptr;
3360
- const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
3361
- if (toktype_idx != -1) {
3362
- toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
3363
- }
3364
-
3365
3651
  // determine vocab type
3366
3652
  {
3367
3653
  std::string tokenizer_name;
3368
3654
 
3369
3655
  ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
3370
3656
 
3371
- if (tokenizer_name == "llama") {
3657
+ if (tokenizer_name == "no_vocab") {
3658
+ vocab.type = LLAMA_VOCAB_TYPE_NONE;
3659
+
3660
+ // default special tokens
3661
+ vocab.special_bos_id = -1;
3662
+ vocab.special_eos_id = -1;
3663
+ vocab.special_unk_id = -1;
3664
+ vocab.special_sep_id = -1;
3665
+ vocab.special_pad_id = -1;
3666
+ vocab.linefeed_id = -1;
3667
+
3668
+ return;
3669
+ } else if (tokenizer_name == "llama") {
3372
3670
  vocab.type = LLAMA_VOCAB_TYPE_SPM;
3373
3671
 
3374
3672
  // default special tokens
@@ -3395,7 +3693,7 @@ static void llm_load_vocab(
3395
3693
 
3396
3694
  for (int i = 0; i < n_merges; i++) {
3397
3695
  const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
3398
- GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
3696
+ GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
3399
3697
 
3400
3698
  std::string first;
3401
3699
  std::string second;
@@ -3434,13 +3732,30 @@ static void llm_load_vocab(
3434
3732
  }
3435
3733
  }
3436
3734
 
3735
+ const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
3736
+ if (token_idx == -1) {
3737
+ throw std::runtime_error("cannot find tokenizer vocab in model file\n");
3738
+ }
3739
+
3740
+ const float * scores = nullptr;
3741
+ const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
3742
+ if (score_idx != -1) {
3743
+ scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
3744
+ }
3745
+
3746
+ const int * toktypes = nullptr;
3747
+ const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
3748
+ if (toktype_idx != -1) {
3749
+ toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
3750
+ }
3751
+
3437
3752
  const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
3438
3753
 
3439
3754
  vocab.id_to_token.resize(n_vocab);
3440
3755
 
3441
3756
  for (uint32_t i = 0; i < n_vocab; i++) {
3442
3757
  std::string word = gguf_get_arr_str(ctx, token_idx, i);
3443
- GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
3758
+ GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
3444
3759
 
3445
3760
  vocab.token_to_id[word] = i;
3446
3761
 
@@ -3632,6 +3947,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
3632
3947
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
3633
3948
  LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
3634
3949
  LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
3950
+ LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
3635
3951
  LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
3636
3952
  LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
3637
3953
  LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
@@ -3639,6 +3955,10 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
3639
3955
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
3640
3956
  LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
3641
3957
  LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
3958
+ LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
3959
+ LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
3960
+ LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
3961
+ LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
3642
3962
  LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
3643
3963
  LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
3644
3964
  if (ml.n_elements >= 1e12) {
@@ -3692,6 +4012,7 @@ static bool llm_load_tensors(
3692
4012
 
3693
4013
  // there is very little benefit to offloading the input layer, so always keep it on the CPU
3694
4014
  model.buft_input = llama_default_buffer_type_cpu(true);
4015
+ //model.buft_input = llama_default_buffer_type_offload(main_gpu);
3695
4016
 
3696
4017
  model.buft_layer.resize(n_layer);
3697
4018
 
@@ -3825,7 +4146,13 @@ static bool llm_load_tensors(
3825
4146
  {
3826
4147
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3827
4148
  if (model.arch != LLM_ARCH_MINICPM){
3828
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
4149
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
4150
+ // if output is NULL, init from the input tok embed
4151
+ if (model.output == NULL) {
4152
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4153
+ ml.n_created--; // artificial tensor
4154
+ ml.size_data += ggml_nbytes(model.output);
4155
+ }
3829
4156
  }
3830
4157
  }
3831
4158
 
@@ -4490,6 +4817,107 @@ static bool llm_load_tensors(
4490
4817
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4491
4818
  }
4492
4819
  } break;
4820
+ case LLM_ARCH_STARCODER2:
4821
+ {
4822
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4823
+
4824
+ // output
4825
+ {
4826
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4827
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
4828
+
4829
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
4830
+ // if output is NULL, init from the input tok embed
4831
+ if (model.output == NULL) {
4832
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4833
+ ml.n_created--; // artificial tensor
4834
+ ml.size_data += ggml_nbytes(model.output);
4835
+ }
4836
+
4837
+ }
4838
+
4839
+ for (int i = 0; i < n_layer; ++i) {
4840
+ ggml_context * ctx_layer = ctx_for_layer(i);
4841
+ ggml_context * ctx_split = ctx_for_layer_split(i);
4842
+
4843
+ auto & layer = model.layers[i];
4844
+
4845
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4846
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
4847
+
4848
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
4849
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
4850
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
4851
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
4852
+
4853
+ // optional bias tensors
4854
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
4855
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
4856
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
4857
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
4858
+
4859
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4860
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
4861
+
4862
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4863
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4864
+
4865
+ // optional bias tensors
4866
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
4867
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff});
4868
+ }
4869
+ } break;
4870
+ case LLM_ARCH_MAMBA:
4871
+ {
4872
+ const int64_t d_conv = hparams.ssm_d_conv;
4873
+ const int64_t d_inner = hparams.ssm_d_inner;
4874
+ const int64_t d_state = hparams.ssm_d_state;
4875
+ const int64_t dt_rank = hparams.ssm_dt_rank;
4876
+ // only an expansion factor of 2 is supported for now
4877
+ GGML_ASSERT(2 * n_embd == d_inner);
4878
+
4879
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4880
+
4881
+ // output
4882
+ {
4883
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4884
+
4885
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
4886
+ // if output is NULL, init from the input tok embed, duplicated to allow offloading
4887
+ if (model.output == NULL) {
4888
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4889
+ ml.n_created--; // artificial tensor
4890
+ ml.size_data += ggml_nbytes(model.output);
4891
+ }
4892
+ }
4893
+
4894
+ for (int i = 0; i < n_layer; ++i) {
4895
+ ggml_context * ctx_layer = ctx_for_layer(i);
4896
+ ggml_context * ctx_split = ctx_for_layer_split(i);
4897
+
4898
+ auto & layer = model.layers[i];
4899
+
4900
+ // norm
4901
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4902
+
4903
+ layer.ssm_in = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner});
4904
+
4905
+ layer.ssm_conv1d = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner});
4906
+ layer.ssm_conv1d_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner});
4907
+
4908
+ layer.ssm_x = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state});
4909
+
4910
+ layer.ssm_dt = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner});
4911
+ layer.ssm_dt_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner});
4912
+
4913
+ // no "weight" suffix for these
4914
+ layer.ssm_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner});
4915
+ layer.ssm_d = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_D, i), {d_inner});
4916
+
4917
+ // out_proj
4918
+ layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
4919
+ }
4920
+ } break;
4493
4921
  default:
4494
4922
  throw std::runtime_error("unknown architecture");
4495
4923
  }
@@ -4610,7 +5038,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
4610
5038
 
4611
5039
  llm_load_print_meta(ml, model);
4612
5040
 
4613
- if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
5041
+ if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
5042
+ model.hparams.n_vocab != model.vocab.id_to_token.size()) {
4614
5043
  throw std::runtime_error("vocab size mismatch");
4615
5044
  }
4616
5045
 
@@ -4674,29 +5103,32 @@ enum llm_norm_type {
4674
5103
 
4675
5104
  static struct ggml_tensor * llm_build_inp_embd(
4676
5105
  struct ggml_context * ctx,
5106
+ struct llama_context & lctx,
4677
5107
  const llama_hparams & hparams,
4678
5108
  const llama_batch & batch,
4679
5109
  struct ggml_tensor * tok_embd,
4680
- struct ggml_tensor * inp_tokens,
4681
- struct ggml_tensor * inp_embd,
4682
5110
  const llm_build_cb & cb) {
4683
5111
  const int64_t n_embd = hparams.n_embd;
4684
5112
 
4685
5113
  struct ggml_tensor * inpL;
4686
5114
 
4687
5115
  if (batch.token) {
4688
- struct ggml_tensor * inp_tokens_v = ggml_view_1d(ctx, inp_tokens, batch.n_tokens, 0);
4689
- cb(inp_tokens, "inp_tokens", -1);
5116
+ lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
5117
+ cb(lctx.inp_tokens, "inp_tokens", -1);
5118
+ ggml_set_input(lctx.inp_tokens);
4690
5119
 
4691
- inpL = ggml_get_rows(ctx, tok_embd, inp_tokens_v);
5120
+ inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
4692
5121
  } else {
4693
5122
  #ifdef GGML_USE_MPI
4694
5123
  GGML_ASSERT(false && "not implemented");
4695
5124
  #endif
4696
-
4697
- inpL = ggml_view_2d(ctx, inp_embd, n_embd, batch.n_tokens, inp_embd->nb[1], 0);
5125
+ lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
5126
+ inpL = lctx.inp_embd;
5127
+ ggml_set_input(lctx.inp_embd);
4698
5128
  }
4699
5129
 
5130
+ cb(inpL, "inp_embd", -1);
5131
+
4700
5132
  return inpL;
4701
5133
  }
4702
5134
 
@@ -4715,6 +5147,8 @@ static void llm_build_kv_store(
4715
5147
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
4716
5148
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
4717
5149
 
5150
+ GGML_ASSERT(kv.size == n_ctx);
5151
+
4718
5152
  // compute the transposed [n_tokens, n_embd] V matrix
4719
5153
  struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
4720
5154
  //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
@@ -4901,8 +5335,8 @@ static struct ggml_tensor * llm_build_kqv(
4901
5335
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
4902
5336
  }
4903
5337
 
4904
- #if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE)
4905
- #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, and Kompute")
5338
+ #if defined(GGML_USE_KOMPUTE)
5339
+ #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
4906
5340
  #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
4907
5341
  #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
4908
5342
  if (hparams.f_max_alibi_bias > 0.0f) {
@@ -4924,6 +5358,8 @@ static struct ggml_tensor * llm_build_kqv(
4924
5358
  cb(kq, "kq_soft_max_ext", il);
4925
5359
  }
4926
5360
 
5361
+ GGML_ASSERT(kv.size == n_ctx);
5362
+
4927
5363
  // split cached v into n_head heads
4928
5364
  struct ggml_tensor * v =
4929
5365
  ggml_view_3d(ctx, kv.v_l[il],
@@ -4986,6 +5422,7 @@ static struct ggml_tensor * llm_build_kv(
4986
5422
  llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
4987
5423
 
4988
5424
  struct ggml_tensor * cur;
5425
+
4989
5426
  cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
4990
5427
  q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
4991
5428
  cb(cur, "kqv_out", il);
@@ -4995,7 +5432,7 @@ static struct ggml_tensor * llm_build_kv(
4995
5432
 
4996
5433
  struct llm_build_context {
4997
5434
  const llama_model & model;
4998
- const llama_context & lctx;
5435
+ llama_context & lctx;
4999
5436
  const llama_hparams & hparams;
5000
5437
  const llama_cparams & cparams;
5001
5438
  const llama_batch & batch;
@@ -5070,10 +5507,10 @@ struct llm_build_context {
5070
5507
  norm_eps (hparams.f_norm_eps),
5071
5508
  norm_rms_eps (hparams.f_norm_rms_eps),
5072
5509
  n_tokens (batch.n_tokens),
5073
- n_kv (worst_case ? n_ctx : kv_self.n),
5074
- kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
5510
+ n_kv (worst_case ? kv_self.size : kv_self.n),
5511
+ kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
5075
5512
  n_orig_ctx (cparams.n_yarn_orig_ctx),
5076
- pooling_type (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE),
5513
+ pooling_type (cparams.pooling_type),
5077
5514
  rope_type (hparams.rope_type),
5078
5515
  cb (cb),
5079
5516
  buf_compute_meta (lctx.buf_compute_meta) {
@@ -5088,6 +5525,18 @@ struct llm_build_context {
5088
5525
  };
5089
5526
 
5090
5527
  ctx0 = ggml_init(params);
5528
+
5529
+ lctx.inp_tokens = nullptr;
5530
+ lctx.inp_embd = nullptr;
5531
+ lctx.inp_pos = nullptr;
5532
+ lctx.inp_KQ_mask = nullptr;
5533
+ lctx.inp_KQ_pos = nullptr;
5534
+ lctx.inp_K_shift = nullptr;
5535
+ lctx.inp_mean = nullptr;
5536
+ lctx.inp_cls = nullptr;
5537
+ lctx.inp_s_copy = nullptr;
5538
+ lctx.inp_s_mask = nullptr;
5539
+ lctx.inp_s_seq = nullptr;
5091
5540
  }
5092
5541
 
5093
5542
  void free() {
@@ -5100,6 +5549,12 @@ struct llm_build_context {
5100
5549
  struct ggml_cgraph * build_k_shift() {
5101
5550
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5102
5551
 
5552
+ GGML_ASSERT(kv_self.size == n_ctx);
5553
+
5554
+ lctx.inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
5555
+ cb(lctx.inp_K_shift, "K_shift", -1);
5556
+ ggml_set_input(lctx.inp_K_shift);
5557
+
5103
5558
  for (int il = 0; il < n_layer; ++il) {
5104
5559
  struct ggml_tensor * tmp =
5105
5560
  // we rotate only the first n_rot dimensions
@@ -5118,6 +5573,29 @@ struct llm_build_context {
5118
5573
  return gf;
5119
5574
  }
5120
5575
 
5576
+ struct ggml_cgraph * build_s_copy() {
5577
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5578
+
5579
+ GGML_ASSERT(kv_self.recurrent);
5580
+
5581
+ struct ggml_tensor * state_copy = build_inp_s_copy();
5582
+
5583
+ for (int il = 0; il < n_layer; ++il) {
5584
+ struct ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s(), kv_self.size);
5585
+ struct ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], hparams.n_embd_v_s(), kv_self.size);
5586
+
5587
+ conv_states = ggml_get_rows(ctx0, conv_states, state_copy);
5588
+ ssm_states = ggml_get_rows(ctx0, ssm_states, state_copy);
5589
+
5590
+ // TODO: name the intermediate tensors with cb()
5591
+
5592
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_states, kv_self.k_l[il]));
5593
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, ssm_states, kv_self.v_l[il]));
5594
+ }
5595
+
5596
+ return gf;
5597
+ }
5598
+
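
build_s_copy above applies the recorded copy sources in a single graph pass: each layer's state buffer is viewed as one row per cache cell, ggml_get_rows gathers rows by the indices in inp_s_copy, and the result is copied back over the buffer. Conceptually this is a row gather; a plain-C++ sketch of the same operation on a small matrix (no ggml, illustrative values only):

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_cells = 4;
        // one small state row per cache cell
        std::vector<std::vector<float>> states = {
            { 0, 0, 0 }, { 1, 1, 1 }, { 2, 2, 2 }, { 3, 3, 3 } };
        // inp_s_copy-style indices: cell i takes its state from cell copy[i]
        std::vector<int> copy = { 0, 1, 0, 2 };

        std::vector<std::vector<float>> gathered(n_cells);
        for (int i = 0; i < n_cells; ++i) {
            gathered[i] = states[copy[i]];   // the per-row gather that ggml_get_rows performs
        }
        states = gathered;                   // written back over the buffer, like the ggml_cpy in the graph

        for (int i = 0; i < n_cells; ++i) {
            std::printf("cell %d now starts from state %g\n", i, states[i][0]);
        }
        return 0;
    }
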
5121
5599
  struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
5122
5600
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5123
5601
 
@@ -5167,6 +5645,66 @@ struct llm_build_context {
5167
5645
  return gf;
5168
5646
  }
5169
5647
 
5648
+ struct ggml_tensor * build_inp_pos() {
5649
+ lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5650
+ cb(lctx.inp_pos, "inp_pos", -1);
5651
+ ggml_set_input(lctx.inp_pos);
5652
+ return lctx.inp_pos;
5653
+ }
5654
+
5655
+ struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
5656
+ if (causal) {
5657
+ lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
5658
+ } else {
5659
+ lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
5660
+ }
5661
+ cb(lctx.inp_KQ_mask, "KQ_mask", -1);
5662
+ ggml_set_input(lctx.inp_KQ_mask);
5663
+ return lctx.inp_KQ_mask;
5664
+ }
5665
+
5666
+ struct ggml_tensor * build_inp_KQ_pos() {
5667
+ lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
5668
+ cb(lctx.inp_KQ_pos, "KQ_pos", -1);
5669
+ ggml_set_input(lctx.inp_KQ_pos);
5670
+ return lctx.inp_KQ_pos;
5671
+ }
5672
+
5673
+ struct ggml_tensor * build_inp_mean() {
5674
+ lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
5675
+ cb(lctx.inp_mean, "inp_mean", -1);
5676
+ ggml_set_input(lctx.inp_mean);
5677
+ return lctx.inp_mean;
5678
+ }
5679
+
5680
+ struct ggml_tensor * build_inp_cls() {
5681
+ lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5682
+ cb(lctx.inp_cls, "inp_cls", -1);
5683
+ ggml_set_input(lctx.inp_cls);
5684
+ return lctx.inp_cls;
5685
+ }
5686
+
5687
+ struct ggml_tensor * build_inp_s_copy() {
5688
+ lctx.inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, kv_self.size);
5689
+ cb(lctx.inp_s_copy, "inp_s_copy", -1);
5690
+ ggml_set_input(lctx.inp_s_copy);
5691
+ return lctx.inp_s_copy;
5692
+ }
5693
+
5694
+ struct ggml_tensor * build_inp_s_mask() {
5695
+ lctx.inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv);
5696
+ cb(lctx.inp_s_mask, "inp_s_mask", -1);
5697
+ ggml_set_input(lctx.inp_s_mask);
5698
+ return lctx.inp_s_mask;
5699
+ }
5700
+
5701
+ struct ggml_tensor * build_inp_s_seq() {
5702
+ lctx.inp_s_seq = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens);
5703
+ cb(lctx.inp_s_seq, "inp_s_seq", -1);
5704
+ ggml_set_input(lctx.inp_s_seq);
5705
+ return lctx.inp_s_seq;
5706
+ }
5707
+
5170
5708
  struct ggml_cgraph * build_llama() {
5171
5709
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5172
5710
 
@@ -5177,16 +5715,13 @@ struct llm_build_context {
5177
5715
  struct ggml_tensor * cur;
5178
5716
  struct ggml_tensor * inpL;
5179
5717
 
5180
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5181
- cb(inpL, "inp_embd", -1);
5718
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
5182
5719
 
5183
5720
  // inp_pos - contains the positions
5184
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5185
- cb(inp_pos, "inp_pos", -1);
5721
+ struct ggml_tensor * inp_pos = build_inp_pos();
5186
5722
 
5187
5723
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5188
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5189
- cb(KQ_mask, "KQ_mask", -1);
5724
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
5190
5725
 
5191
5726
  for (int il = 0; il < n_layer; ++il) {
5192
5727
  struct ggml_tensor * inpSA = inpL;
@@ -5238,7 +5773,6 @@ struct llm_build_context {
5238
5773
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5239
5774
  model.layers[il].wo, model.layers[il].bo,
5240
5775
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5241
- cb(cur, "kqv_out", il);
5242
5776
  }
5243
5777
 
5244
5778
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -5356,20 +5890,16 @@ struct llm_build_context {
5356
5890
  struct ggml_tensor * cur;
5357
5891
  struct ggml_tensor * inpL;
5358
5892
 
5359
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5360
- cb(inpL, "inp_embd", -1);
5893
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
5361
5894
 
5362
5895
  // inp_pos - contains the positions
5363
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5364
- cb(inp_pos, "inp_pos", -1);
5896
+ struct ggml_tensor * inp_pos = build_inp_pos();
5365
5897
 
5366
5898
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5367
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5368
- cb(KQ_mask, "KQ_mask", -1);
5899
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
5369
5900
 
5370
5901
  // positions of the tokens in the KV cache
5371
- struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
5372
- cb(KQ_pos, "KQ_pos", -1);
5902
+ struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
5373
5903
 
5374
5904
  for (int il = 0; il < n_layer; ++il) {
5375
5905
  struct ggml_tensor * inpSA = inpL;
@@ -5417,7 +5947,6 @@ struct llm_build_context {
5417
5947
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5418
5948
  model.layers[il].wo, NULL,
5419
5949
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5420
- cb(cur, "kqv_out", il);
5421
5950
  }
5422
5951
 
5423
5952
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -5473,16 +6002,13 @@ struct llm_build_context {
5473
6002
  struct ggml_tensor * cur;
5474
6003
  struct ggml_tensor * inpL;
5475
6004
 
5476
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5477
- cb(inpL, "inp_embd", -1);
6005
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
5478
6006
 
5479
6007
  // inp_pos - contains the positions
5480
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5481
- cb(inp_pos, "inp_pos", -1);
6008
+ struct ggml_tensor * inp_pos = build_inp_pos();
5482
6009
 
5483
6010
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5484
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5485
- cb(KQ_mask, "KQ_mask", -1);
6011
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
5486
6012
 
5487
6013
  for (int il = 0; il < n_layer; ++il) {
5488
6014
  struct ggml_tensor * attn_norm;
@@ -5536,7 +6062,6 @@ struct llm_build_context {
5536
6062
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5537
6063
  model.layers[il].wo, NULL,
5538
6064
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5539
- cb(cur, "kqv_out", il);
5540
6065
  }
5541
6066
 
5542
6067
  struct ggml_tensor * ffn_inp = cur;
@@ -5587,21 +6112,17 @@ struct llm_build_context {
5587
6112
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5588
6113
 
5589
6114
  struct ggml_tensor * cur;
5590
- struct ggml_tensor * pos;
5591
6115
  struct ggml_tensor * inpL;
5592
6116
 
5593
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5594
- cb(inpL, "inp_embd", -1);
6117
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
5595
6118
 
5596
6119
  // inp_pos - contains the positions
5597
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5598
- cb(inp_pos, "inp_pos", -1);
6120
+ struct ggml_tensor * inp_pos = build_inp_pos();
5599
6121
 
5600
6122
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5601
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5602
- cb(KQ_mask, "KQ_mask", -1);
6123
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
5603
6124
 
5604
- pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
6125
+ struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
5605
6126
  cb(pos, "pos_embd", -1);
5606
6127
 
5607
6128
  inpL = ggml_add(ctx0, inpL, pos);
@@ -5635,7 +6156,6 @@ struct llm_build_context {
5635
6156
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5636
6157
  model.layers[il].wo, model.layers[il].bo,
5637
6158
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5638
- cb(cur, "kqv_out", il);
5639
6159
  }
5640
6160
 
5641
6161
  // add the input
@@ -5687,16 +6207,13 @@ struct llm_build_context {
5687
6207
  struct ggml_tensor * cur;
5688
6208
  struct ggml_tensor * inpL;
5689
6209
 
5690
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5691
- cb(inpL, "inp_embd", -1);
6210
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
5692
6211
 
5693
6212
  // inp_pos - contains the positions
5694
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5695
- cb(inp_pos, "inp_pos", -1);
6213
+ struct ggml_tensor * inp_pos = build_inp_pos();
5696
6214
 
5697
6215
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5698
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5699
- cb(KQ_mask, "KQ_mask", -1);
6216
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
5700
6217
 
5701
6218
  for (int il = 0; il < n_layer; ++il) {
5702
6219
  struct ggml_tensor * residual = inpL;
@@ -5836,7 +6353,6 @@ struct llm_build_context {
5836
6353
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5837
6354
  model.layers[il].wo, model.layers[il].bo,
5838
6355
  Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5839
- cb(cur, "kqv_out", il);
5840
6356
  }
5841
6357
 
5842
6358
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
@@ -5890,16 +6406,13 @@ struct llm_build_context {
5890
6406
  struct ggml_tensor * cur;
5891
6407
  struct ggml_tensor * inpL;
5892
6408
 
5893
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5894
- cb(inpL, "inp_embd", -1);
6409
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
5895
6410
 
5896
6411
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5897
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5898
- cb(KQ_mask, "KQ_mask", -1);
6412
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
5899
6413
 
5900
6414
  // positions of the tokens in the KV cache
5901
- struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
5902
- cb(KQ_pos, "KQ_pos", -1);
6415
+ struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
5903
6416
 
5904
6417
  for (int il = 0; il < n_layer; ++il) {
5905
6418
  struct ggml_tensor * inpSA = inpL;
@@ -5929,7 +6442,6 @@ struct llm_build_context {
5929
6442
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5930
6443
  model.layers[il].wo, NULL,
5931
6444
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5932
- cb(cur, "kqv_out", il);
5933
6445
  }
5934
6446
 
5935
6447
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -5979,19 +6491,18 @@ struct llm_build_context {
5979
6491
 
5980
6492
  const int64_t n_embd_head = hparams.n_embd_head_v;
5981
6493
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6494
+
5982
6495
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5983
6496
 
5984
6497
  struct ggml_tensor * cur;
5985
6498
  struct ggml_tensor * inpL;
5986
6499
 
5987
- // get input vectors with right size
5988
- const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
5989
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5990
- struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
5991
- struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
6500
+ struct ggml_tensor * inp_pos = build_inp_pos();
6501
+ struct ggml_tensor * inp_mean = build_inp_mean();
6502
+ struct ggml_tensor * inp_cls = build_inp_cls();
5992
6503
 
5993
6504
  // construct input embeddings (token, type, position)
5994
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6505
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
5995
6506
 
5996
6507
  // token types are hardcoded to zero ("Sentence A")
5997
6508
  struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
@@ -6006,39 +6517,37 @@ struct llm_build_context {
6006
6517
  cb(inpL, "inp_norm", -1);
6007
6518
 
6008
6519
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6009
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6010
- cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
6520
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false);
6011
6521
 
6012
6522
  // iterate layers
6013
6523
  for (int il = 0; il < n_layer; ++il) {
6014
6524
  struct ggml_tensor * cur = inpL;
6015
6525
 
6526
+ struct ggml_tensor * Qcur;
6527
+ struct ggml_tensor * Kcur;
6528
+ struct ggml_tensor * Vcur;
6529
+
6016
6530
  // self-attention
6017
6531
  if (model.arch == LLM_ARCH_BERT) {
6018
- struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
6532
+ Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
6019
6533
  cb(Qcur, "Qcur", il);
6020
6534
 
6021
- struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
6535
+ Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
6022
6536
  cb(Kcur, "Kcur", il);
6023
6537
 
6024
- struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
6538
+ Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
6025
6539
  cb(Vcur, "Vcur", il);
6026
6540
 
6027
- // seems like we just need to do this for Q?
6028
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6029
-
6030
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6031
- model.layers[il].wo, model.layers[il].bo,
6032
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6033
- cb(cur, "kqv_out", il);
6541
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6542
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6034
6543
  } else {
6035
6544
  // compute Q and K and RoPE them
6036
6545
  cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
6037
6546
  cb(cur, "wqkv", il);
6038
6547
 
6039
- struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6040
- struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6041
- struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6548
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6549
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6550
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6042
6551
 
6043
6552
  cb(Qcur, "Qcur", il);
6044
6553
  cb(Kcur, "Kcur", il);
@@ -6057,12 +6566,40 @@ struct llm_build_context {
6057
6566
  ext_factor, attn_factor, beta_fast, beta_slow
6058
6567
  );
6059
6568
  cb(Kcur, "Kcur", il);
6569
+ }
6060
6570
 
6061
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6062
- model.layers[il].wo, model.layers[il].bo,
6063
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6064
- cb(cur, "kqv_out", il);
6571
+ struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
6572
+ struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
6573
+
6574
+ struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
6575
+ cb(kq, "kq", il);
6576
+
6577
+ kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
6578
+ cb(kq, "kq_soft_max_ext", il);
6579
+
6580
+ struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
6581
+ cb(v, "v", il);
6582
+
6583
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
6584
+ cb(kqv, "kqv", il);
6585
+
6586
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
6587
+ cb(kqv_merged, "kqv_merged", il);
6588
+
6589
+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
6590
+ cb(cur, "kqv_merged_cont", il);
6591
+
6592
+ ggml_build_forward_expand(gf, cur);
6593
+
6594
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
6595
+ if (model.layers[il].bo) {
6596
+ cb(cur, "kqv_wo", il);
6597
+ }
6598
+
6599
+ if (model.layers[il].bo) {
6600
+ cur = ggml_add(ctx0, cur, model.layers[il].bo);
6065
6601
  }
6602
+ cb(cur, "kqv_out", il);
6066
6603
 
6067
6604
  // re-add the layer input
6068
6605
  cur = ggml_add(ctx0, cur, inpL);
@@ -6103,16 +6640,29 @@ struct llm_build_context {
6103
6640
 
6104
6641
  // final output
6105
6642
  cur = inpL;
6643
+ cb(cur, "result_embd", -1);
6106
6644
 
6107
6645
  // pooling layer
6108
- if (pooling_type == LLAMA_POOLING_TYPE_MEAN) {
6109
- cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
6110
- } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) {
6111
- cur = ggml_get_rows(ctx0, cur, inp_cls);
6112
- } else {
6113
- GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type");
6646
+ switch (pooling_type) {
6647
+ case LLAMA_POOLING_TYPE_NONE:
6648
+ {
6649
+ // nop
6650
+ } break;
6651
+ case LLAMA_POOLING_TYPE_MEAN:
6652
+ {
6653
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
6654
+ cb(cur, "result_embd_pooled", -1);
6655
+ } break;
6656
+ case LLAMA_POOLING_TYPE_CLS:
6657
+ {
6658
+ cur = ggml_get_rows(ctx0, cur, inp_cls);
6659
+ cb(cur, "result_embd_pooled", -1);
6660
+ } break;
6661
+ case LLAMA_POOLING_TYPE_UNSPECIFIED:
6662
+ {
6663
+ GGML_ASSERT(false && "Invalid pooling type");
6664
+ } break;
6114
6665
  }
6115
- cb(cur, "result_embd", -1);
6116
6666
 
6117
6667
  ggml_build_forward_expand(gf, cur);
6118
6668
 
@@ -6129,16 +6679,13 @@ struct llm_build_context {
6129
6679
  struct ggml_tensor * cur;
6130
6680
  struct ggml_tensor * inpL;
6131
6681
 
6132
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6133
- cb(inpL, "inp_embd", -1);
6682
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6134
6683
 
6135
6684
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6136
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6137
- cb(KQ_mask, "KQ_mask", -1);
6685
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6138
6686
 
6139
6687
  // positions of the tokens in the KV cache
6140
- struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
6141
- cb(KQ_pos, "KQ_pos", -1);
6688
+ struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
6142
6689
 
6143
6690
  inpL = llm_build_norm(ctx0, inpL, hparams,
6144
6691
  model.tok_norm,
@@ -6174,7 +6721,6 @@ struct llm_build_context {
6174
6721
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6175
6722
  model.layers[il].wo, model.layers[il].bo,
6176
6723
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6177
- cb(cur, "kqv_out", il);
6178
6724
  }
6179
6725
 
6180
6726
  // Add the input
@@ -6226,16 +6772,13 @@ struct llm_build_context {
6226
6772
  struct ggml_tensor * cur;
6227
6773
  struct ggml_tensor * inpL;
6228
6774
 
6229
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6230
- cb(inpL, "inp_embd", -1);
6775
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6231
6776
 
6232
6777
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6233
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6234
- cb(KQ_mask, "KQ_mask", -1);
6778
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6235
6779
 
6236
6780
  // positions of the tokens in the KV cache
6237
- struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
6238
- cb(KQ_pos, "KQ_pos", -1);
6781
+ struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
6239
6782
 
6240
6783
  for (int il = 0; il < n_layer; ++il) {
6241
6784
  struct ggml_tensor * attn_norm;
@@ -6276,7 +6819,6 @@ struct llm_build_context {
6276
6819
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6277
6820
  model.layers[il].wo, model.layers[il].bo,
6278
6821
  Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6279
- cb(cur, "kqv_out", il);
6280
6822
  }
6281
6823
 
6282
6824
  // Add the input
@@ -6331,16 +6873,13 @@ struct llm_build_context {
6331
6873
  struct ggml_tensor * cur;
6332
6874
  struct ggml_tensor * inpL;
6333
6875
 
6334
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6335
- cb(inpL, "inp_embd", -1);
6876
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6336
6877
 
6337
6878
  // inp_pos - contains the positions
6338
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6339
- cb(inp_pos, "inp_pos", -1);
6879
+ struct ggml_tensor * inp_pos = build_inp_pos();
6340
6880
 
6341
6881
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6342
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6343
- cb(KQ_mask, "KQ_mask", -1);
6882
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6344
6883
 
6345
6884
  for (int il = 0; il < n_layer; ++il) {
6346
6885
  struct ggml_tensor * inpSA = inpL;
@@ -6393,7 +6932,6 @@ struct llm_build_context {
6393
6932
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6394
6933
  model.layers[il].wo, NULL,
6395
6934
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6396
- cb(cur, "kqv_out", il);
6397
6935
  }
6398
6936
 
6399
6937
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -6449,16 +6987,13 @@ struct llm_build_context {
6449
6987
  struct ggml_tensor * cur;
6450
6988
  struct ggml_tensor * inpL;
6451
6989
 
6452
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6453
- cb(inpL, "inp_embd", -1);
6990
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6454
6991
 
6455
6992
  // inp_pos - contains the positions
6456
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6457
- cb(inp_pos, "inp_pos", -1);
6993
+ struct ggml_tensor * inp_pos = build_inp_pos();
6458
6994
 
6459
6995
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6460
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6461
- cb(KQ_mask, "KQ_mask", -1);
6996
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6462
6997
 
6463
6998
  for (int il = 0; il < n_layer; ++il) {
6464
6999
  struct ggml_tensor * inpSA = inpL;
@@ -6503,7 +7038,6 @@ struct llm_build_context {
6503
7038
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6504
7039
  model.layers[il].wo, NULL,
6505
7040
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6506
- cb(cur, "kqv_out", il);
6507
7041
  }
6508
7042
 
6509
7043
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -6558,16 +7092,13 @@ struct llm_build_context {
6558
7092
  struct ggml_tensor * cur;
6559
7093
  struct ggml_tensor * inpL;
6560
7094
 
6561
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6562
- cb(inpL, "inp_embd", -1);
7095
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6563
7096
 
6564
7097
  // inp_pos - contains the positions
6565
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6566
- cb(inp_pos, "inp_pos", -1);
7098
+ struct ggml_tensor * inp_pos = build_inp_pos();
6567
7099
 
6568
7100
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6569
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6570
- cb(KQ_mask, "KQ_mask", -1);
7101
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6571
7102
 
6572
7103
  for (int il = 0; il < n_layer; ++il) {
6573
7104
  struct ggml_tensor * inpSA = inpL;
@@ -6619,7 +7150,6 @@ struct llm_build_context {
6619
7150
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6620
7151
  model.layers[il].wo, model.layers[il].bo,
6621
7152
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6622
- cb(cur, "kqv_out", il);
6623
7153
  }
6624
7154
 
6625
7155
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -6674,16 +7204,13 @@ struct llm_build_context {
6674
7204
  struct ggml_tensor * ffn_output;
6675
7205
  struct ggml_tensor * inpL;
6676
7206
 
6677
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6678
- cb(inpL, "inp_embd", -1);
7207
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6679
7208
 
6680
7209
  // inp_pos - contains the positions
6681
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6682
- cb(inp_pos, "inp_pos", -1);
7210
+ struct ggml_tensor * inp_pos = build_inp_pos();
6683
7211
 
6684
7212
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6685
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6686
- cb(KQ_mask, "KQ_mask", -1);
7213
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6687
7214
 
6688
7215
  for (int il = 0; il < n_layer; ++il) {
6689
7216
  attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
@@ -6741,7 +7268,6 @@ struct llm_build_context {
6741
7268
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6742
7269
  model.layers[il].wo, model.layers[il].bo,
6743
7270
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
6744
- cb(cur, "kqv_out", il);
6745
7271
  }
6746
7272
 
6747
7273
  // FF
@@ -6791,16 +7317,13 @@ struct llm_build_context {
6791
7317
  struct ggml_tensor * cur;
6792
7318
  struct ggml_tensor * inpL;
6793
7319
 
6794
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6795
- cb(inpL, "inp_embd", -1);
7320
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6796
7321
 
6797
7322
  // inp_pos - contains the positions
6798
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6799
- cb(inp_pos, "inp_pos", -1);
7323
+ struct ggml_tensor * inp_pos = build_inp_pos();
6800
7324
 
6801
7325
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6802
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6803
- cb(KQ_mask, "KQ_mask", -1);
7326
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6804
7327
 
6805
7328
  for (int il = 0; il < n_layer; ++il) {
6806
7329
 
@@ -6839,7 +7362,6 @@ struct llm_build_context {
6839
7362
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6840
7363
  model.layers[il].wo, NULL,
6841
7364
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6842
- cb(cur, "kqv_out", il);
6843
7365
  }
6844
7366
  struct ggml_tensor * sa_out = cur;
6845
7367
 
@@ -6893,16 +7415,13 @@ struct llm_build_context {
6893
7415
  struct ggml_tensor * pos;
6894
7416
  struct ggml_tensor * inpL;
6895
7417
 
6896
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6897
- cb(inpL, "inp_embd", -1);
7418
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6898
7419
 
6899
7420
  // inp_pos - contains the positions
6900
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6901
- cb(inp_pos, "inp_pos", -1);
7421
+ struct ggml_tensor * inp_pos = build_inp_pos();
6902
7422
 
6903
7423
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6904
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6905
- cb(KQ_mask, "KQ_mask", -1);
7424
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6906
7425
 
6907
7426
  pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
6908
7427
  cb(pos, "pos_embd", -1);
@@ -6938,7 +7457,6 @@ struct llm_build_context {
6938
7457
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6939
7458
  model.layers[il].wo, model.layers[il].bo,
6940
7459
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6941
- cb(cur, "kqv_out", il);
6942
7460
  }
6943
7461
 
6944
7462
  // add the input
@@ -6991,16 +7509,13 @@ struct llm_build_context {
6991
7509
  struct ggml_tensor * cur;
6992
7510
  struct ggml_tensor * inpL;
6993
7511
 
6994
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6995
- cb(inpL, "inp_embd", -1);
7512
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6996
7513
 
6997
7514
  // inp_pos - contains the positions
6998
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6999
- cb(inp_pos, "inp_pos", -1);
7515
+ struct ggml_tensor * inp_pos = build_inp_pos();
7000
7516
 
7001
7517
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7002
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7003
- cb(KQ_mask, "KQ_mask", -1);
7518
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7004
7519
 
7005
7520
  for (int il = 0; il < n_layer; ++il) {
7006
7521
  cur = llm_build_norm(ctx0, inpL, hparams,
@@ -7042,7 +7557,6 @@ struct llm_build_context {
7042
7557
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7043
7558
  model.layers[il].wo, model.layers[il].bo,
7044
7559
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7045
- cb(cur, "kqv_out", il);
7046
7560
  }
7047
7561
 
7048
7562
  // add the input
@@ -7094,16 +7608,13 @@ struct llm_build_context {
7094
7608
  struct ggml_tensor * cur;
7095
7609
  struct ggml_tensor * inpL;
7096
7610
 
7097
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
7098
- cb(inpL, "inp_embd", -1);
7611
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
7099
7612
 
7100
7613
  // inp_pos - contains the positions
7101
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
7102
- cb(inp_pos, "inp_pos", -1);
7614
+ struct ggml_tensor * inp_pos = build_inp_pos();
7103
7615
 
7104
7616
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7105
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7106
- cb(KQ_mask, "KQ_mask", -1);
7617
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7107
7618
 
7108
7619
  for (int il = 0; il < n_layer; ++il) {
7109
7620
  struct ggml_tensor * inpSA = inpL;
@@ -7155,7 +7666,6 @@ struct llm_build_context {
7155
7666
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7156
7667
  model.layers[il].wo, NULL,
7157
7668
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7158
- cb(cur, "kqv_out", il);
7159
7669
  }
7160
7670
 
7161
7671
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -7208,16 +7718,13 @@ struct llm_build_context {
7208
7718
  struct ggml_tensor * cur;
7209
7719
  struct ggml_tensor * inpL;
7210
7720
 
7211
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
7212
- cb(inpL, "inp_embd", -1);
7721
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
7213
7722
 
7214
7723
  // inp_pos - contains the positions
7215
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
7216
- cb(inp_pos, "inp_pos", -1);
7724
+ struct ggml_tensor * inp_pos = build_inp_pos();
7217
7725
 
7218
7726
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7219
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7220
- cb(KQ_mask, "KQ_mask", -1);
7727
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7221
7728
 
7222
7729
  for (int il = 0; il < n_layer; ++il) {
7223
7730
  struct ggml_tensor * inpSA = inpL;
@@ -7269,7 +7776,6 @@ struct llm_build_context {
7269
7776
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7270
7777
  model.layers[il].wo, model.layers[il].bo,
7271
7778
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7272
- cb(cur, "kqv_out", il);
7273
7779
  }
7274
7780
 
7275
7781
  struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -7331,20 +7837,17 @@ struct llm_build_context {
7331
7837
  struct ggml_tensor * cur;
7332
7838
  struct ggml_tensor * inpL;
7333
7839
 
7334
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
7335
- cb(inpL, "inp_embd", -1);
7840
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
7336
7841
 
7337
7842
  // scale the input embeddings
7338
7843
  inpL = ggml_scale(ctx0, inpL, scale_embd);
7339
7844
  cb(inpL, "inp_scaled", -1);
7340
7845
 
7341
7846
  // inp_pos - contains the positions
7342
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
7343
- cb(inp_pos, "inp_pos", -1);
7847
+ struct ggml_tensor * inp_pos = build_inp_pos();
7344
7848
 
7345
7849
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7346
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7347
- cb(KQ_mask, "KQ_mask", -1);
7850
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7348
7851
 
7349
7852
  for (int il = 0; il < n_layer; ++il) {
7350
7853
  struct ggml_tensor * inpSA = inpL;
@@ -7396,7 +7899,6 @@ struct llm_build_context {
7396
7899
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7397
7900
  model.layers[il].wo, model.layers[il].bo,
7398
7901
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7399
- cb(cur, "kqv_out", il);
7400
7902
  }
7401
7903
 
7402
7904
  // scale_res - scale the hidden states for residual connection
@@ -7463,22 +7965,18 @@ struct llm_build_context {
7463
7965
  struct ggml_tensor * cur;
7464
7966
  struct ggml_tensor * inpL;
7465
7967
 
7466
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
7467
- cb(inpL, "inp_embd", -1);
7968
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
7468
7969
 
7469
7970
  inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
7470
7971
  cb(inpL, "inp_scaled", -1);
7471
7972
 
7472
7973
  // inp_pos - contains the positions
7473
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
7474
- cb(inp_pos, "inp_pos", -1);
7974
+ struct ggml_tensor * inp_pos = build_inp_pos();
7475
7975
 
7476
7976
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7477
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7478
- cb(KQ_mask, "KQ_mask", -1);
7977
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7479
7978
 
7480
7979
  for (int il = 0; il < n_layer; ++il) {
7481
-
7482
7980
  // norm
7483
7981
  cur = llm_build_norm(ctx0, inpL, hparams,
7484
7982
  model.layers[il].attn_norm, NULL,
@@ -7515,7 +8013,6 @@ struct llm_build_context {
7515
8013
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7516
8014
  model.layers[il].wo, NULL,
7517
8015
  Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7518
- cb(cur, "kqv_out", il);
7519
8016
  }
7520
8017
 
7521
8018
  struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
@@ -7559,6 +8056,255 @@ struct llm_build_context {
7559
8056
 
7560
8057
  return gf;
7561
8058
  }
8059
+
8060
+ struct ggml_cgraph * build_starcoder2() {
8061
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8062
+
8063
+ const int64_t n_embd_head = hparams.n_embd_head_v;
8064
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8065
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
8066
+
8067
+ struct ggml_tensor * cur;
8068
+ struct ggml_tensor * inpL;
8069
+
8070
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
8071
+
8072
+ // inp_pos - contains the positions
8073
+ struct ggml_tensor * inp_pos = build_inp_pos();
8074
+
8075
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8076
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8077
+
8078
+ for (int il = 0; il < n_layer; ++il) {
8079
+ struct ggml_tensor * inpSA = inpL;
8080
+
8081
+ // norm
8082
+ cur = llm_build_norm(ctx0, inpL, hparams,
8083
+ model.layers[il].attn_norm, model.layers[il].attn_norm_b,
8084
+ LLM_NORM, cb, il);
8085
+ cb(cur, "attn_norm", il);
8086
+
8087
+ // self-attention
8088
+ {
8089
+ // compute Q and K and RoPE them
8090
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
8091
+ cb(Qcur, "Qcur", il);
8092
+ if (model.layers[il].bq) {
8093
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8094
+ cb(Qcur, "Qcur", il);
8095
+ }
8096
+
8097
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
8098
+ cb(Kcur, "Kcur", il);
8099
+ if (model.layers[il].bk) {
8100
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
8101
+ cb(Kcur, "Kcur", il);
8102
+ }
8103
+
8104
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
8105
+ cb(Vcur, "Vcur", il);
8106
+ if (model.layers[il].bv) {
8107
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8108
+ cb(Vcur, "Vcur", il);
8109
+ }
8110
+
8111
+ Qcur = ggml_rope_custom(
8112
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8113
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8114
+ ext_factor, attn_factor, beta_fast, beta_slow
8115
+ );
8116
+ cb(Qcur, "Qcur", il);
8117
+
8118
+ Kcur = ggml_rope_custom(
8119
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8120
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8121
+ ext_factor, attn_factor, beta_fast, beta_slow
8122
+ );
8123
+ cb(Kcur, "Kcur", il);
8124
+
8125
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8126
+ model.layers[il].wo, model.layers[il].bo,
8127
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8128
+ cb(cur, "kqv_out", il);
8129
+ }
8130
+
8131
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8132
+ cb(ffn_inp, "ffn_inp", il);
8133
+
8134
+ // feed-forward network
8135
+
8136
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
8137
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
8138
+ LLM_NORM, cb, il);
8139
+ cb(cur, "ffn_norm", il);
8140
+
8141
+ cur = llm_build_ffn(ctx0, cur,
8142
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
8143
+ NULL, NULL,
8144
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8145
+ NULL,
8146
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
8147
+ cb(cur, "ffn_out", il);
8148
+ cur = ggml_add(ctx0, cur, ffn_inp);
8149
+ cb(cur, "l_out", il);
8150
+
8151
+ // input for next layer
8152
+ inpL = cur;
8153
+ }
8154
+
8155
+ cur = inpL;
8156
+
8157
+ cur = llm_build_norm(ctx0, cur, hparams,
8158
+ model.output_norm, model.output_norm_b,
8159
+ LLM_NORM, cb, -1);
8160
+ cb(cur, "result_norm", -1);
8161
+
8162
+ // lm_head
8163
+ cur = ggml_mul_mat(ctx0, model.output, cur);
8164
+ cb(cur, "result_output", -1);
8165
+
8166
+ ggml_build_forward_expand(gf, cur);
8167
+
8168
+ return gf;
8169
+ }
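build_starcoder2 follows the usual pattern of RoPE attention plus a sequential feed-forward block (LLM_FFN_GELU with LLM_FFN_SEQ: up projection, GELU, down projection, each with a bias). A per-token sketch of that FFN in plain C++ (hypothetical names; the tanh approximation of GELU is used here, the exact variant in ggml may differ):

    #include <cmath>
    #include <vector>

    // tanh-approximation GELU
    static float gelu(float x) {
        return 0.5f * x * (1.0f + std::tanh(0.7978845608f /* sqrt(2/pi) */ * (x + 0.044715f * x * x * x)));
    }

    // y = W_down * gelu(W_up * x + b_up) + b_down for one token vector x.
    // w_up is n_ff x n_embd, w_down is n_embd x n_ff, both row-major.
    static std::vector<float> ffn_gelu(const std::vector<float> & x,
                                       const std::vector<std::vector<float>> & w_up,
                                       const std::vector<float> & b_up,
                                       const std::vector<std::vector<float>> & w_down,
                                       const std::vector<float> & b_down) {
        std::vector<float> h(w_up.size());
        for (size_t i = 0; i < w_up.size(); ++i) {
            float acc = b_up[i];
            for (size_t j = 0; j < x.size(); ++j) acc += w_up[i][j] * x[j];
            h[i] = gelu(acc);
        }
        std::vector<float> y(w_down.size());
        for (size_t i = 0; i < w_down.size(); ++i) {
            float acc = b_down[i];
            for (size_t j = 0; j < h.size(); ++j) acc += w_down[i][j] * h[j];
            y[i] = acc;
        }
        return y;
    }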
8170
+
8171
+ struct ggml_cgraph * build_mamba() {
8172
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8173
+
8174
+ const int64_t d_model = n_embd;
8175
+ const int64_t d_conv = hparams.ssm_d_conv;
8176
+ const int64_t d_inner = hparams.ssm_d_inner;
8177
+ GGML_ASSERT(2 * d_model == d_inner);
8178
+ const int64_t d_state = hparams.ssm_d_state;
8179
+ const int64_t dt_rank = hparams.ssm_dt_rank;
8180
+
8181
+ struct ggml_tensor * cur;
8182
+ struct ggml_tensor * inpL;
8183
+
8184
+ // {n_embd, n_tokens}
8185
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
8186
+
8187
+ struct ggml_tensor * state_mask = build_inp_s_mask();
8188
+ struct ggml_tensor * state_seq = build_inp_s_seq();
8189
+
8190
+ for (int il = 0; il < n_layer; ++il) {
8191
+ // (ab)using the KV cache to store the states
8192
+ struct ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s(), kv_self.size);
8193
+ struct ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], hparams.n_embd_v_s(), kv_self.size);
8194
+
8195
+ // clear states of sequences which are starting at the beginning of this batch
8196
+ {
8197
+ conv_states = ggml_mul(ctx0,
8198
+ ggml_view_2d(ctx0, conv_states, conv_states->ne[0], n_kv, conv_states->nb[1], kv_head*conv_states->nb[1]),
8199
+ state_mask);
8200
+ ssm_states = ggml_mul(ctx0,
8201
+ ggml_view_2d(ctx0, ssm_states, ssm_states->ne[0], n_kv, ssm_states->nb[1], kv_head*ssm_states->nb[1]),
8202
+ state_mask);
8203
+ }
8204
+
8205
+ conv_states = ggml_reshape_3d(ctx0, conv_states, d_conv - 1, d_inner, n_kv);
8206
+ ssm_states = ggml_reshape_3d(ctx0, ssm_states, d_state, d_inner, n_kv);
8207
+
8208
+ // norm
8209
+ cur = llm_build_norm(ctx0, inpL, hparams,
8210
+ model.layers[il].attn_norm, NULL,
8211
+ LLM_NORM_RMS, cb, il);
8212
+ cb(cur, "attn_norm", il);
8213
+
8214
+ // {n_embd, 2*d_inner} * {n_embd, n_tokens} => {2*d_inner, n_tokens}
8215
+ struct ggml_tensor * xz = ggml_mul_mat(ctx0, model.layers[il].ssm_in, cur);
8216
+ // split the above in two
8217
+ // => {d_inner, n_tokens}
8218
+ struct ggml_tensor * x = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], 0);
8219
+ struct ggml_tensor * z = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], ggml_element_size(xz)*d_inner);
8220
+
8221
+ // conv
8222
+ {
8223
+ // Custom operator which is needed only to ease simultaneous sequence processing.
8224
+ // For a single sequence, the equivalent is to concatenate the columns of conv_states and x,
8225
+ // then make a self-overlapping view of that over d_conv columns at each stride in the 3rd dimension,
8226
+ // then element-wise multiply that with the conv1d weight,
8227
+ // then sum the elements of each row,
8228
+ // (the last two steps are a dot product over rows (also doable with mul_mat))
8229
+ // then permute away the ne[0] dimension,
8230
+ // and then you're left with the resulting x tensor.
8231
+ // The new conv_states is the last (d_conv - 1) columns
8232
+ // of the last 3rd dimensional "layer" of the self-overlapping view.
8233
+ // For simultaneous sequences, it's more complicated.
8234
+ struct ggml_tensor * x_conv = ggml_ssm_conv(ctx0, conv_states, x, model.layers[il].ssm_conv1d, state_seq);
8235
+
8236
+ // store last (d_conv - 1) columns of the conv_state part of x_conv back into the KV cache
8237
+ ggml_build_forward_expand(gf,
8238
+ ggml_cpy(ctx0,
8239
+ ggml_view_2d(ctx0, x_conv, d_conv - 1, d_inner*n_kv, d_conv*ggml_element_size(x_conv), (1+d_inner*n_tokens)*ggml_element_size(x_conv)),
8240
+ ggml_view_1d(ctx0, kv_self.k_l[il], (d_conv - 1)*(d_inner)*(n_kv), kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(x_conv))));
8241
+
8242
+ // extract x from x_conv
8243
+ x = ggml_view_2d(ctx0, x_conv, d_inner, n_tokens, d_inner*ggml_element_size(x_conv), 0);
8244
+
8245
+ // bias
8246
+ x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b);
8247
+
8248
+ x = ggml_silu(ctx0, x);
8249
+ }
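The comment above describes what the custom ggml_ssm_conv operator computes. For a single sequence it is a depthwise causal 1-D convolution with a small rolling state. A standalone sketch in plain C++ (hypothetical helper, std::vector only; the bias add and SiLU stay outside the operator, as in the graph above):

    #include <vector>

    // x:      d_inner rows of n_tokens new inputs (x[c][t])
    // state:  d_inner rows of (d_conv - 1) previous inputs, updated in place
    // weight: d_inner rows of d_conv filter taps (one filter per channel)
    static std::vector<std::vector<float>> causal_conv1d(
            const std::vector<std::vector<float>> & x,
            std::vector<std::vector<float>>       & state,
            const std::vector<std::vector<float>> & weight) {
        const size_t d_inner  = x.size();
        const size_t n_tokens = x[0].size();
        const size_t d_conv   = weight[0].size();

        std::vector<std::vector<float>> y(d_inner, std::vector<float>(n_tokens, 0.0f));

        for (size_t c = 0; c < d_inner; ++c) {
            // concatenate the stored columns with the new ones for this channel
            std::vector<float> padded = state[c];
            padded.insert(padded.end(), x[c].begin(), x[c].end());

            // self-overlapping window of width d_conv, dotted with the channel's filter
            for (size_t t = 0; t < n_tokens; ++t) {
                float acc = 0.0f;
                for (size_t k = 0; k < d_conv; ++k) {
                    acc += weight[c][k] * padded[t + k];
                }
                y[c][t] = acc;
            }

            // the new conv state is the last (d_conv - 1) columns of the padded row
            state[c].assign(padded.end() - (d_conv - 1), padded.end());
        }
        return y;
    }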
8250
+
8251
+ // ssm
8252
+ {
8253
+ // {d_inner, dt_rank + 2*d_state} * {d_inner, n_tokens} => {dt_rank + 2*d_state, n_tokens}
8254
+ struct ggml_tensor * x_db = ggml_mul_mat(ctx0, model.layers[il].ssm_x, x);
8255
+ // split
8256
+ struct ggml_tensor * dt = ggml_view_2d(ctx0, x_db, dt_rank, n_tokens, x_db->nb[1], 0);
8257
+ struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*dt_rank);
8258
+ struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state));
8259
+
8260
+ // {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens}
8261
+ dt = ggml_mul_mat(ctx0, model.layers[il].ssm_dt, dt);
8262
+ dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
8263
+
8264
+ // Custom operator to optimize the parallel associative scan
8265
+ // as described in Annex D of the Mamba paper.
8266
+ // => {d_inner, n_tokens} and {d_state, d_inner, n_kv} combined,
8267
+ // because only a single tensor can be returned.
8268
+ struct ggml_tensor * y_ssm_states = ggml_ssm_scan(ctx0, ssm_states, x, dt, model.layers[il].ssm_a, B, C, state_seq);
8269
+
8270
+ // store last states (the second part of y_ssm_states)
8271
+ ggml_build_forward_expand(gf,
8272
+ ggml_cpy(ctx0,
8273
+ ggml_view_1d(ctx0, y_ssm_states, d_state*d_inner*n_kv, d_inner*n_tokens*ggml_element_size(y_ssm_states)),
8274
+ ggml_view_1d(ctx0, kv_self.v_l[il], d_state*d_inner*n_kv, kv_head*d_state*d_inner*ggml_element_size(ssm_states))));
8275
+
8276
+ struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0);
8277
+
8278
+ // {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens}
8279
+ y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
8280
+ y = ggml_mul(ctx0, y, ggml_silu(ctx0, z));
8281
+
8282
+ // {d_inner, n_embd} * {d_inner, n_tokens} => {n_embd, n_tokens}
8283
+ cur = ggml_mul_mat(ctx0, model.layers[il].ssm_out, y);
8284
+ }
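ggml_ssm_scan fuses the selective state-space recurrence over all inner channels, tokens and sequences. For one channel of one token it reduces to the update below (a sketch assuming the standard Mamba discretization, i.e. softplus on the time step and zero-order hold on A; hypothetical helper name). The D*x skip connection and the SiLU(z) gate are applied outside the operator, exactly as in the graph above:

    #include <cmath>
    #include <vector>

    // One recurrence step for a single token and inner channel.
    // state (size d_state) is the hidden state h for this channel, updated in place.
    static float ssm_step(std::vector<float> & state,
                          const std::vector<float> & A,      // per-channel row of the A matrix, size d_state
                          const std::vector<float> & B,      // input projection for this token, size d_state
                          const std::vector<float> & C,      // output projection for this token, size d_state
                          float x,                           // conv output for this channel/token
                          float dt_raw) {                    // time step before softplus
        // softplus keeps the step positive; large values pass through to avoid overflow
        const float dt = dt_raw <= 20.0f ? std::log1p(std::exp(dt_raw)) : dt_raw;
        float y = 0.0f;
        for (size_t n = 0; n < state.size(); ++n) {
            state[n] = std::exp(dt * A[n]) * state[n] + dt * B[n] * x;  // h = dA*h + dB*x
            y += C[n] * state[n];                                       // y = C . h
        }
        return y;
    }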
8285
+
8286
+ // residual
8287
+ cur = ggml_add(ctx0, cur, inpL);
8288
+ cb(cur, "l_out", il);
8289
+
8290
+ // input for next layer
8291
+ inpL = cur;
8292
+ }
8293
+
8294
+ // final rmsnorm
8295
+ cur = llm_build_norm(ctx0, inpL, hparams,
8296
+ model.output_norm, NULL,
8297
+ LLM_NORM_RMS, cb, -1);
8298
+ cb(cur, "result_norm", -1);
8299
+
8300
+ // lm_head
8301
+ cur = ggml_mul_mat(ctx0, model.output, cur);
8302
+ cb(cur, "result_output", -1);
8303
+
8304
+ ggml_build_forward_expand(gf, cur);
8305
+
8306
+ return gf;
8307
+ }
7562
8308
  };
7563
8309
 
7564
8310
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -7595,6 +8341,23 @@ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
7595
8341
  return result;
7596
8342
  }
7597
8343
 
8344
+ static struct ggml_cgraph * llama_build_graph_s_copy(llama_context & lctx) {
8345
+ llama_batch dummy;
8346
+ dummy.n_tokens = 0;
8347
+
8348
+ llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
8349
+
8350
+ struct llm_build_context llm(lctx, dummy, cb, false);
8351
+
8352
+ llm.init();
8353
+
8354
+ struct ggml_cgraph * result = llm.build_s_copy();
8355
+
8356
+ llm.free();
8357
+
8358
+ return result;
8359
+ }
8360
+
7598
8361
  static struct ggml_cgraph * llama_build_graph(
7599
8362
  llama_context & lctx,
7600
8363
  const llama_batch & batch,
@@ -7612,7 +8375,18 @@ static struct ggml_cgraph * llama_build_graph(
7612
8375
  if (!lctx.cparams.offload_kqv) {
7613
8376
  if (strcmp(name, "kqv_merged_cont") == 0) {
7614
8377
  // all nodes between the KV store and the attention output are run on the CPU
7615
- ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
8378
+ ggml_backend_sched_set_tensor_backend(lctx.sched, cur, lctx.backend_cpu);
8379
+ }
8380
+ }
8381
+
8382
+ // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
8383
+ // to fix this, we assign the norm layer manually to the backend of its layer
8384
+ if (il != -1 && strcmp(name, "norm") == 0) {
8385
+ for (auto * backend : lctx.backends) {
8386
+ if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
8387
+ ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
8388
+ break;
8389
+ }
7616
8390
  }
7617
8391
  }
7618
8392
  };
@@ -7705,6 +8479,14 @@ static struct ggml_cgraph * llama_build_graph(
7705
8479
  {
7706
8480
  result = llm.build_gemma();
7707
8481
  } break;
8482
+ case LLM_ARCH_STARCODER2:
8483
+ {
8484
+ result = llm.build_starcoder2();
8485
+ } break;
8486
+ case LLM_ARCH_MAMBA:
8487
+ {
8488
+ result = llm.build_mamba();
8489
+ } break;
7708
8490
  default:
7709
8491
  GGML_ASSERT(false);
7710
8492
  }
@@ -7715,19 +8497,29 @@ static struct ggml_cgraph * llama_build_graph(
7715
8497
  }
7716
8498
 
7717
8499
  static void llama_set_k_shift(llama_context & lctx) {
7718
- const auto & cparams = lctx.cparams;
7719
-
7720
- const int64_t n_ctx = cparams.n_ctx;
8500
+ const int64_t kv_size = lctx.kv_self.size;
7721
8501
 
7722
8502
  assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
7723
8503
 
7724
8504
  int32_t * data = (int32_t *) lctx.inp_K_shift->data;
7725
8505
 
7726
- for (int i = 0; i < n_ctx; ++i) {
8506
+ for (int i = 0; i < kv_size; ++i) {
7727
8507
  data[i] = lctx.kv_self.cells[i].delta;
7728
8508
  }
7729
8509
  }
7730
8510
 
8511
+ static void llama_set_s_copy(llama_context & lctx) {
8512
+ const int64_t kv_size = lctx.kv_self.size;
8513
+
8514
+ assert(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer));
8515
+
8516
+ int32_t * data = (int32_t *) lctx.inp_s_copy->data;
8517
+
8518
+ for (int i = 0; i < kv_size; ++i) {
8519
+ data[i] = lctx.kv_self.cells[i].src;
8520
+ }
8521
+ }
8522
+
7731
8523
  static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7732
8524
  //
7733
8525
  // set input data
@@ -7750,34 +8542,74 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7750
8542
  ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
7751
8543
  }
7752
8544
 
7753
- if (batch.pos) {
8545
+ if (batch.pos && lctx.inp_pos) {
7754
8546
  const int64_t n_tokens = batch.n_tokens;
7755
8547
 
7756
8548
  ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
7757
8549
  }
7758
8550
 
7759
- {
7760
- const int64_t n_kv = kv_self.n;
7761
- const int64_t n_tokens = batch.n_tokens;
8551
+ GGML_ASSERT(
8552
+ (hparams.causal_attn || !cparams.causal_attn) &&
8553
+ "non-causal attention with generative models is not supported"
8554
+ );
7762
8555
 
7763
- assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
8556
+ if (lctx.inp_KQ_mask) {
8557
+ // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache.
8558
+ if (cparams.causal_attn) {
8559
+ const int64_t n_kv = kv_self.n;
8560
+ const int64_t n_tokens = batch.n_tokens;
7764
8561
 
7765
- float * data = (float *) lctx.inp_KQ_mask->data;
8562
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
7766
8563
 
7767
- for (int h = 0; h < 1; ++h) {
7768
- for (int j = 0; j < n_tokens; ++j) {
7769
- const llama_pos pos = batch.pos[j];
7770
- const llama_seq_id seq_id = batch.seq_id[j][0];
8564
+ float * data = (float *) lctx.inp_KQ_mask->data;
7771
8565
 
7772
- for (int i = 0; i < n_kv; ++i) {
7773
- float f;
7774
- if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
7775
- (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
7776
- f = -INFINITY;
7777
- } else {
7778
- f = 0;
8566
+ // For causal attention, use only the previous KV cells
8567
+ // of the correct sequence for each token of the batch.
8568
+ // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
8569
+ for (int h = 0; h < 1; ++h) {
8570
+ for (int j = 0; j < n_tokens; ++j) {
8571
+ const llama_pos pos = batch.pos[j];
8572
+ const llama_seq_id seq_id = batch.seq_id[j][0];
8573
+
8574
+ for (int i = 0; i < n_kv; ++i) {
8575
+ float f;
8576
+ if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
8577
+ f = -INFINITY;
8578
+ } else {
8579
+ f = 0.0f;
8580
+ }
8581
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
8582
+ }
8583
+ }
8584
+ }
8585
+ } else {
8586
+ // when using kv cache, the mask needs to match the kv cache size
8587
+ const int64_t n_tokens = batch.n_tokens;
8588
+ const int64_t n_stride = hparams.causal_attn ? kv_self.n : n_tokens;
8589
+
8590
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
8591
+
8592
+ float * data = (float *) lctx.inp_KQ_mask->data;
8593
+
8594
+ for (int h = 0; h < 1; ++h) {
8595
+ for (int j = 0; j < n_tokens; ++j) {
8596
+ const llama_seq_id seq_id = batch.seq_id[j][0];
8597
+
8598
+ for (int i = 0; i < n_tokens; ++i) {
8599
+ float f = -INFINITY;
8600
+ for (int s = 0; s < batch.n_seq_id[i]; ++s) {
8601
+ if (batch.seq_id[i][s] == seq_id) {
8602
+ f = 0.0f;
8603
+ break;
8604
+ }
8605
+ }
8606
+
8607
+ data[h*(n_tokens*n_tokens) + j*n_stride + i] = f;
8608
+ }
8609
+
8610
+ for (int i = n_tokens; i < n_stride; ++i) {
8611
+ data[h*(n_tokens*n_tokens) + j*n_stride + i] = -INFINITY;
7779
8612
  }
7780
- data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
7781
8613
  }
7782
8614
  }
7783
8615
  }
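The loops above write an additive mask: 0.0f where batch token j may attend to KV cell i (same sequence and not in the future, in the causal case), -INFINITY everywhere else. A compact sketch of the causal rule in plain C++ (hypothetical helper; real cells can belong to several sequences, a single id per cell is assumed here):

    #include <cmath>
    #include <vector>

    // Returns a row-major [n_tokens][n_kv] additive attention mask.
    static std::vector<float> build_causal_mask(
            const std::vector<int> & cell_seq_id,  // sequence stored in each KV cell
            const std::vector<int> & cell_pos,     // position stored in each KV cell
            const std::vector<int> & tok_seq_id,   // sequence of each batch token
            const std::vector<int> & tok_pos) {    // position of each batch token
        const size_t n_kv     = cell_seq_id.size();
        const size_t n_tokens = tok_seq_id.size();
        std::vector<float> mask(n_tokens * n_kv, -INFINITY);

        for (size_t j = 0; j < n_tokens; ++j) {
            for (size_t i = 0; i < n_kv; ++i) {
                if (cell_seq_id[i] == tok_seq_id[j] && cell_pos[i] <= tok_pos[j]) {
                    mask[j*n_kv + i] = 0.0f;   // visible: same sequence, not a future position
                }
            }
        }
        return mask;
    }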
@@ -7786,7 +8618,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7786
8618
  if (hparams.need_kq_pos) {
7787
8619
  const int64_t n_kv = kv_self.n;
7788
8620
 
7789
- assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
8621
+ GGML_ASSERT(lctx.inp_KQ_pos);
8622
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
7790
8623
 
7791
8624
  float * data = (float *) lctx.inp_KQ_pos->data;
7792
8625
 
@@ -7795,17 +8628,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7795
8628
  }
7796
8629
  }
7797
8630
 
7798
- if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
8631
+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
7799
8632
  const int64_t n_tokens = batch.n_tokens;
7800
8633
 
8634
+ GGML_ASSERT(lctx.inp_mean);
7801
8635
  GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
7802
- float * data = (float *) lctx.inp_mean->data;
7803
8636
 
8637
+ float * data = (float *) lctx.inp_mean->data;
7804
8638
  memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
7805
8639
 
7806
8640
  std::vector<uint64_t> sum(n_tokens, 0);
7807
8641
  for (int i = 0; i < n_tokens; ++i) {
7808
8642
  const llama_seq_id seq_id = batch.seq_id[i][0];
8643
+
8644
+ GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
8645
+
7809
8646
  sum[seq_id] += 1;
7810
8647
  }
7811
8648
 
@@ -7823,20 +8660,73 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7823
8660
  }
7824
8661
  }
7825
8662
 
7826
- if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
8663
+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
7827
8664
  const int64_t n_tokens = batch.n_tokens;
7828
8665
 
8666
+ GGML_ASSERT(lctx.inp_cls);
7829
8667
  GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
8668
+
7830
8669
  uint32_t * data = (uint32_t *) lctx.inp_cls->data;
8670
+ memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
7831
8671
 
7832
8672
  for (int i = 0; i < n_tokens; ++i) {
7833
8673
  const llama_seq_id seq_id = batch.seq_id[i][0];
7834
- const llama_pos pos = batch.pos[i];
8674
+ const llama_pos pos = batch.pos[i];
8675
+
8676
+ GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
8677
+
7835
8678
  if (pos == 0) {
7836
8679
  data[seq_id] = i;
7837
8680
  }
7838
8681
  }
7839
8682
  }
8683
+
8684
+ if (kv_self.recurrent) {
8685
+ const int64_t n_kv = kv_self.n;
8686
+
8687
+ if (lctx.inp_s_mask) {
8688
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer));
8689
+ float * data = (float *) lctx.inp_s_mask->data;
8690
+
8691
+ // states which are not affected by the current batch are left untouched
8692
+ for (int i = 0; i < n_kv; ++i) {
8693
+ llama_seq_id seq_id = i + lctx.kv_self.head;
8694
+ llama_kv_cell & kv_cell = lctx.kv_self.cells[seq_id];
8695
+ bool has_self_seq = kv_cell.has_seq_id(seq_id);
8696
+
8697
+ data[i] = (float) has_self_seq;
8698
+
8699
+ // ensure current sequences will be kept
8700
+ if (!has_self_seq && kv_cell.pos >= 0) {
8701
+ kv_cell.seq_id.insert(seq_id);
8702
+ }
8703
+ }
8704
+ }
8705
+ // For Mamba (and other recurrent architectures),
8706
+ // update the correct state(s)/sequence(s) for each token of the batch.
8707
+ // Like with the KQ_mask, if a token in the batch has multiple sequences,
8708
+ // they are assumed to be equivalent (not here, but in ggml_ssm_scan and ggml_ssm_conv).
8709
+ if (lctx.inp_s_seq) {
8710
+ const int64_t n_tokens = batch.n_tokens;
8711
+
8712
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_seq->buffer));
8713
+ int32_t * data = (int32_t *) lctx.inp_s_seq->data;
8714
+
8715
+ for (int j = 0; j < n_tokens; ++j) {
8716
+ const int32_t n_seq = batch.n_seq_id[j];
8717
+ GGML_ASSERT(0 < n_seq); // a token should be part of at least 1 sequence
8718
+
8719
+ for (int i = 0; i < n_kv; ++i) {
8720
+ if (i < n_seq) {
8721
+ // for this type of model, the head is the minimum seq_id of the batch
8722
+ data[j*n_kv + i] = batch.seq_id[j][i] - kv_self.head;
8723
+ } else {
8724
+ data[j*n_kv + i] = -1;
8725
+ }
8726
+ }
8727
+ }
8728
+ }
8729
+ }
7840
8730
  }
7841
8731
 
7842
8732
  static void llama_graph_compute(
@@ -7856,9 +8746,10 @@ static void llama_graph_compute(
7856
8746
 
7857
8747
  if (lctx.backend_cpu != nullptr) {
7858
8748
  ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
8749
+ ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
7859
8750
  }
7860
8751
 
7861
- ggml_backend_sched_graph_compute(lctx.sched, gf);
8752
+ ggml_backend_sched_graph_compute_async(lctx.sched, gf);
7862
8753
 
7863
8754
  // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
7864
8755
 
@@ -7878,10 +8769,11 @@ static void llama_graph_compute(
7878
8769
  //
7879
8770
  static int llama_decode_internal(
7880
8771
  llama_context & lctx,
7881
- llama_batch batch) {
7882
- const uint32_t n_tokens = batch.n_tokens;
8772
+ llama_batch batch_all) { // TODO: rename back to batch
8773
+
8774
+ const uint32_t n_tokens_all = batch_all.n_tokens;
7883
8775
 
7884
- if (n_tokens == 0) {
8776
+ if (n_tokens_all == 0) {
7885
8777
  LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
7886
8778
  return -1;
7887
8779
  }
@@ -7890,14 +8782,16 @@ static int llama_decode_internal(
7890
8782
  const auto & hparams = model.hparams;
7891
8783
  const auto & cparams = lctx.cparams;
7892
8784
 
7893
- const auto n_batch = cparams.n_batch;
8785
+ GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
7894
8786
 
7895
- GGML_ASSERT(n_tokens <= n_batch);
7896
- GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
8787
+ GGML_ASSERT(n_tokens_all <= cparams.n_batch);
7897
8788
 
7898
- int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
8789
+ GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
7899
8790
 
7900
- const int64_t t_start_us = ggml_time_us();
8791
+ if (lctx.t_compute_start_us == 0) {
8792
+ lctx.t_compute_start_us = ggml_time_us();
8793
+ }
8794
+ lctx.n_queued_tokens += n_tokens_all;
7901
8795
 
7902
8796
  #ifdef GGML_USE_MPI
7903
8797
  // TODO: needs fix after #3228
@@ -7905,213 +8799,274 @@ static int llama_decode_internal(
7905
8799
  //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
7906
8800
  #endif
7907
8801
 
7908
- GGML_ASSERT(n_threads > 0);
7909
-
7910
8802
  auto & kv_self = lctx.kv_self;
7911
8803
 
7912
8804
  const int64_t n_embd = hparams.n_embd;
7913
8805
  const int64_t n_vocab = hparams.n_vocab;
7914
8806
 
7915
- // helpers for smoother batch API transition
7916
- // after deprecating the llama_eval calls, these will be removed
7917
- std::vector<llama_pos> pos;
7918
8807
 
7919
- std::vector<int32_t> n_seq_id;
7920
- std::vector<llama_seq_id *> seq_id_arr;
7921
- std::vector<std::vector<llama_seq_id>> seq_id;
8808
+ auto * logits_out = lctx.logits;
7922
8809
 
7923
- if (batch.pos == nullptr) {
7924
- pos.resize(n_tokens);
7925
- for (uint32_t i = 0; i < n_tokens; i++) {
7926
- pos[i] = batch.all_pos_0 + i*batch.all_pos_1;
7927
- }
8810
+ #ifndef NDEBUG
8811
+ auto & logits_valid = lctx.logits_valid;
8812
+ logits_valid.clear();
8813
+ logits_valid.resize(n_tokens_all);
7928
8814
 
7929
- batch.pos = pos.data();
7930
- }
8815
+ memset(logits_out, 0, lctx.logits_size*sizeof(float));
8816
+ #endif
7931
8817
 
7932
- if (batch.seq_id == nullptr) {
7933
- n_seq_id.resize(n_tokens);
7934
- seq_id.resize(n_tokens);
7935
- seq_id_arr.resize(n_tokens);
7936
- for (uint32_t i = 0; i < n_tokens; i++) {
7937
- n_seq_id[i] = 1;
7938
- seq_id[i].resize(1);
7939
- seq_id[i][0] = batch.all_seq_id;
7940
- seq_id_arr[i] = seq_id[i].data();
7941
- }
8818
+ const auto n_ubatch = cparams.n_ubatch;
7942
8819
 
7943
- batch.n_seq_id = n_seq_id.data();
7944
- batch.seq_id = seq_id_arr.data();
7945
- }
8820
+ std::vector<llama_pos> pos;
8821
+ std::vector<int32_t> n_seq_id;
8822
+ std::vector<llama_seq_id *> seq_id_arr;
8823
+ std::vector<std::vector<llama_seq_id>> seq_id;
7946
8824
 
7947
- llama_kv_cache_update(&lctx);
8825
+ for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
8826
+ const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
8827
+ llama_batch u_batch = {
8828
+ /* .n_tokens = */ (int32_t) n_tokens,
8829
+ /* .token = */ batch_all.token ? batch_all.token + cur_token : nullptr,
8830
+ /* .embd = */ batch_all.embd ? batch_all.embd + cur_token*n_embd : nullptr,
8831
+ /* .pos = */ batch_all.pos ? batch_all.pos + cur_token : nullptr,
8832
+ /* .n_seq_id = */ batch_all.n_seq_id ? batch_all.n_seq_id + cur_token : nullptr,
8833
+ /* .seq_id = */ batch_all.seq_id ? batch_all.seq_id + cur_token : nullptr,
8834
+ /* .logits = */ batch_all.logits ? batch_all.logits + cur_token : nullptr,
8835
+ /* .all_pos_0 = */ batch_all.all_pos_0 + (llama_pos) cur_token*batch_all.all_pos_1,
8836
+ /* .all_pos_1 = */ batch_all.all_pos_1,
8837
+ /* .all_seq_id = */ batch_all.all_seq_id,
8838
+ };
7948
8839
 
7949
- // if we have enough unused cells before the current head ->
7950
- // better to start searching from the beginning of the cache, hoping to fill it
7951
- if (kv_self.head > kv_self.used + 2*n_tokens) {
7952
- kv_self.head = 0;
7953
- }
8840
+ int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
8841
+ GGML_ASSERT(n_threads > 0);
7954
8842
 
7955
- if (!llama_kv_cache_find_slot(kv_self, batch)) {
7956
- return 1;
7957
- }
8843
+ // helpers for smoother batch API transition
8844
+ // after deprecating the llama_eval calls, these will be removed
8845
+ if (u_batch.pos == nullptr) {
8846
+ pos.resize(n_tokens);
8847
+ for (uint32_t i = 0; i < n_tokens; i++) {
8848
+ pos[i] = u_batch.all_pos_0 + i*u_batch.all_pos_1;
8849
+ }
7958
8850
 
7959
- // a heuristic, to avoid attending the full cache if it is not yet utilized
7960
- // after enough generations, the benefit from this heuristic disappears
7961
- // if we start defragmenting the cache, the benefit from this will be more important
7962
- kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
7963
- //kv_self.n = llama_kv_cache_cell_max(kv_self);
8851
+ u_batch.pos = pos.data();
8852
+ }
7964
8853
 
7965
- //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
8854
+ if (u_batch.seq_id == nullptr) {
8855
+ n_seq_id.resize(n_tokens);
8856
+ seq_id.resize(n_tokens);
8857
+ seq_id_arr.resize(n_tokens);
8858
+ for (uint32_t i = 0; i < n_tokens; i++) {
8859
+ n_seq_id[i] = 1;
8860
+ seq_id[i].resize(1);
8861
+ seq_id[i][0] = u_batch.all_seq_id;
8862
+ seq_id_arr[i] = seq_id[i].data();
8863
+ }
7966
8864
 
7967
- ggml_backend_sched_reset(lctx.sched);
7968
- ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
8865
+ u_batch.n_seq_id = n_seq_id.data();
8866
+ u_batch.seq_id = seq_id_arr.data();
8867
+ }
7969
8868
 
7970
- ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
8869
+ // non-causal masks do not use the KV cache
8870
+ if (hparams.causal_attn) {
8871
+ llama_kv_cache_update(&lctx);
7971
8872
 
7972
- // the output is always the last tensor in the graph
7973
- struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
7974
- struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
8873
+ // if we have enough unused cells before the current head ->
8874
+ // better to start searching from the beginning of the cache, hoping to fill it
8875
+ if (kv_self.head > kv_self.used + 2*n_tokens) {
8876
+ kv_self.head = 0;
8877
+ }
8878
+
8879
+ if (!llama_kv_cache_find_slot(kv_self, u_batch)) {
8880
+ return 1;
8881
+ }
7975
8882
 
7976
- if (strcmp(res->name, "result_output") == 0) {
7977
- // the embeddings could be the second to last tensor, or the third to last tensor
7978
- if (strcmp(embeddings->name, "result_norm") != 0) {
7979
- embeddings = gf->nodes[gf->n_nodes - 3];
7980
- GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
8883
+ if (!kv_self.recurrent) {
8884
+ // a heuristic, to avoid attending the full cache if it is not yet utilized
8885
+ // after enough generations, the benefit from this heuristic disappears
8886
+ // if we start defragmenting the cache, the benefit from this will be more important
8887
+ kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
8888
+ //kv_self.n = llama_kv_cache_cell_max(kv_self);
8889
+ }
7981
8890
  }
7982
- } else if (strcmp(res->name, "result_embd") == 0) {
7983
- embeddings = res;
7984
- res = nullptr;
7985
- } else {
7986
- GGML_ASSERT(false);
7987
- }
7988
8891
 
7989
- // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
8892
+ //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
7990
8893
 
7991
- // for big prompts, if BLAS is enabled, it is better to use only one thread
7992
- // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
7993
- // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
7994
- // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
7995
- // with the BLAS calls. need a better solution
7996
- // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
7997
- // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
7998
- if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
7999
- n_threads = std::min(4, n_threads);
8000
- }
8894
+ ggml_backend_sched_reset(lctx.sched);
8895
+ ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
8001
8896
 
8002
- llama_set_inputs(lctx, batch);
8897
+ ggml_cgraph * gf = llama_build_graph(lctx, u_batch, false);
8003
8898
 
8004
- llama_graph_compute(lctx, gf, n_threads);
8899
+ // the output is always the last tensor in the graph
8900
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
8901
+ struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
8005
8902
 
8006
- // update the kv ring buffer
8007
- {
8008
- kv_self.head += n_tokens;
8903
+ if (!hparams.causal_attn) {
8904
+ res = nullptr; // do not extract logits for embedding models such as BERT
8009
8905
 
8010
- // Ensure kv cache head points to a valid index.
8011
- if (kv_self.head >= kv_self.size) {
8012
- kv_self.head = 0;
8013
- }
8014
- }
8015
-
8016
- // decide if we need to defrag the kv cache
8017
- if (cparams.defrag_thold >= 0.0f) {
8018
- const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens)/float(kv_self.n) : 0.0f;
8906
+ // token or sequence embeddings
8907
+ embd = gf->nodes[gf->n_nodes - 1];
8019
8908
 
8020
- // queue defragmentation for next llama_kv_cache_update
8021
- if (fragmentation > cparams.defrag_thold) {
8022
- //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
8909
+ GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
8910
+ } else {
8911
+ if (strcmp(res->name, "result_output") == 0) {
8912
+ // the token embeddings could be the second to last tensor, or the third to last tensor
8913
+ if (strcmp(embd->name, "result_norm") != 0) {
8914
+ embd = gf->nodes[gf->n_nodes - 3];
8915
+ GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
8916
+ }
8917
+ } else {
8918
+ GGML_ASSERT(false && "missing result_output tensor");
8919
+ }
8920
+ }
8921
+ // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
8023
8922
 
8024
- llama_kv_cache_defrag(kv_self);
8923
+ // for big prompts, if BLAS is enabled, it is better to use only one thread
8924
+ // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
8925
+ // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
8926
+ // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
8927
+ // with the BLAS calls. need a better solution
8928
+ // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
8929
+ // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
8930
+ if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
8931
+ n_threads = std::min(4, n_threads);
8025
8932
  }
8026
- }
8027
8933
 
8028
- #ifdef GGML_PERF
8029
- // print timing information per ggml operation (for debugging purposes)
8030
- // requires GGML_PERF to be defined
8031
- ggml_graph_print(gf);
8032
- #endif
8934
+ ggml_backend_sched_alloc_graph(lctx.sched, gf);
8033
8935
 
8034
- // plot the computation graph in dot format (for debugging purposes)
8035
- //if (n_past%100 == 0) {
8036
- // ggml_graph_dump_dot(gf, NULL, "llama.dot");
8037
- //}
8936
+ llama_set_inputs(lctx, u_batch);
8038
8937
 
8039
- // extract logits
8040
- // TODO: do not compute and extract logits if only embeddings are needed
8041
- // need to update the graphs to skip "result_output"
8042
- if (res) {
8043
- auto & logits_out = lctx.logits;
8938
+ llama_graph_compute(lctx, gf, n_threads);
8044
8939
 
8045
- #ifndef NDEBUG
8046
- auto & logits_valid = lctx.logits_valid;
8047
- logits_valid.clear();
8048
- logits_valid.resize(n_tokens);
8940
+ // update the kv ring buffer
8941
+ {
8942
+ kv_self.head += n_tokens;
8943
+
8944
+ // Ensure kv cache head points to a valid index.
8945
+ if (kv_self.head >= kv_self.size) {
8946
+ kv_self.head = 0;
8947
+ }
8948
+ }
8049
8949
 
8050
- logits_out.clear();
8950
+ #ifdef GGML_PERF
8951
+ // print timing information per ggml operation (for debugging purposes)
8952
+ // requires GGML_PERF to be defined
8953
+ ggml_graph_print(gf);
8051
8954
  #endif
8052
8955
 
8053
- ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
8054
- GGML_ASSERT(res_backend != nullptr);
8055
- if (batch.logits) {
8056
- logits_out.resize(n_vocab * n_tokens);
8057
- for (uint32_t i = 0; i < n_tokens; i++) {
8058
- if (batch.logits[i] == 0) {
8059
- continue;
8060
- }
8061
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
8956
+ // plot the computation graph in dot format (for debugging purposes)
8957
+ //if (n_past%100 == 0) {
8958
+ // ggml_graph_dump_dot(gf, NULL, "llama.dot");
8959
+ //}
8960
+
8961
+ // extract logits
8962
+ // TODO: do not compute and extract logits if only embeddings are needed
8963
+ // update the graphs to skip "result_output" if logits are not needed
8964
+ if (res) {
8965
+ ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
8966
+ GGML_ASSERT(backend_res != nullptr);
8967
+ if (u_batch.logits) {
8968
+ int32_t i_first = -1;
8969
+ for (uint32_t i = 0; i < n_tokens; i++) {
8970
+ if (u_batch.logits[i] && i_first == -1) {
8971
+ i_first = (int32_t) i;
8972
+ }
8973
+ if (u_batch.logits[i] == 0 || i == n_tokens - 1) {
8974
+ if (i_first != -1) {
8975
+ int i_last = u_batch.logits[i] == 0 ? i : i + 1;
8976
+ // extract logits for the range [i_first, i_last)
8977
+ // group the requests to minimize the number of calls to the backend
8978
+ ggml_backend_tensor_get_async(backend_res, res,
8979
+ logits_out + n_vocab*(cur_token + i_first),
8980
+ i_first*n_vocab*sizeof(float),
8981
+ (i_last - i_first)*n_vocab*sizeof(float));
8982
+ i_first = -1;
8983
+ }
8984
+ }
8062
8985
  #ifndef NDEBUG
8063
- logits_valid[i] = true;
8986
+ logits_valid[cur_token + i] = u_batch.logits[i] != 0;
8064
8987
  #endif
8065
- }
8066
- } else if (lctx.logits_all) {
8067
- logits_out.resize(n_vocab * n_tokens);
8068
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
8988
+ }
8989
+ } else if (lctx.logits_all) {
8990
+ ggml_backend_tensor_get_async(backend_res, res, logits_out + n_vocab*cur_token, 0, n_vocab*n_tokens*sizeof(float));
8069
8991
  #ifndef NDEBUG
8070
- std::fill(logits_valid.begin(), logits_valid.end(), true);
8992
+ std::fill(logits_valid.begin() + cur_token, logits_valid.begin() + cur_token + n_tokens, true);
8071
8993
  #endif
8072
- } else {
8073
- logits_out.resize(n_vocab);
8074
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
8994
+ } else {
8995
+ if (cur_token + n_tokens >= n_tokens_all) {
8996
+ ggml_backend_tensor_get_async(backend_res, res, logits_out, n_vocab*(n_tokens - 1)*sizeof(float), n_vocab*sizeof(float));
8075
8997
  #ifndef NDEBUG
8076
- logits_valid[0] = true;
8998
+ logits_valid[0] = true;
8077
8999
  #endif
9000
+ }
9001
+ }
8078
9002
  }
8079
- ggml_backend_synchronize(res_backend);
8080
- }
8081
9003
 
8082
- // extract embeddings
8083
- if (!lctx.embedding.empty()) {
8084
- auto & embedding_out = lctx.embedding;
9004
+ // extract embeddings
9005
+ if (cparams.embeddings && embd) {
9006
+ ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
9007
+ GGML_ASSERT(backend_embd != nullptr);
8085
9008
 
8086
- const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
8087
- const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
9009
+ switch (cparams.pooling_type) {
9010
+ case LLAMA_POOLING_TYPE_NONE:
9011
+ {
9012
+ // extract token embeddings
9013
+ auto & embd_out = lctx.embd;
9014
+
9015
+ if (u_batch.logits) {
9016
+ //embd_out.resize(n_embd * n_tokens);
9017
+ for (uint32_t i = 0; i < n_tokens; i++) {
9018
+ if (u_batch.logits[i] == 0) {
9019
+ continue;
9020
+ }
9021
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_out + n_embd*(i + cur_token), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
9022
+ }
9023
+ }
9024
+ } break;
9025
+ case LLAMA_POOLING_TYPE_CLS:
9026
+ case LLAMA_POOLING_TYPE_MEAN:
9027
+ {
9028
+ GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
8088
9029
 
8089
- embedding_out.resize(embd_size);
8090
- ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
8091
- ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
8092
- ggml_backend_synchronize(embeddings_backend);
8093
- }
9030
+ // extract sequence embeddings
9031
+ auto & embd_seq_out = lctx.embd_seq;
9032
+ embd_seq_out.clear();
8094
9033
 
8095
- // measure the performance only for the single-token evals
8096
- if (n_tokens == 1) {
8097
- lctx.t_eval_us += ggml_time_us() - t_start_us;
8098
- lctx.n_eval++;
8099
- }
8100
- else if (n_tokens > 1) {
8101
- lctx.t_p_eval_us += ggml_time_us() - t_start_us;
8102
- lctx.n_p_eval += n_tokens;
9034
+ for (uint32_t i = 0; i < n_tokens; i++) {
9035
+ const llama_seq_id seq_id = u_batch.seq_id[i][0];
9036
+ if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
9037
+ continue;
9038
+ }
9039
+ embd_seq_out[seq_id].resize(n_embd);
9040
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
9041
+ }
9042
+ } break;
9043
+ case LLAMA_POOLING_TYPE_UNSPECIFIED:
9044
+ {
9045
+ GGML_ASSERT(false && "unknown pooling type");
9046
+ } break;
9047
+ }
9048
+ }
8103
9049
  }
8104
9050
 
8105
- // get a more accurate load time, upon first eval
8106
- // TODO: fix this
8107
- if (!lctx.has_evaluated_once) {
8108
- lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
8109
- lctx.has_evaluated_once = true;
9051
+ // wait for the computation to finish (automatically done when obtaining the model output)
9052
+ //llama_synchronize(&lctx);
9053
+
9054
+ // decide if we need to defrag the kv cache
9055
+ if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
9056
+ const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
9057
+
9058
+ // queue defragmentation for next llama_kv_cache_update
9059
+ if (fragmentation > cparams.defrag_thold) {
9060
+ //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
9061
+
9062
+ llama_kv_cache_defrag(kv_self);
9063
+ }
8110
9064
  }
8111
9065
 
8112
9066
  return 0;
8113
9067
  }
8114
9068
 
9069
+
8115
9070
  // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
8116
9071
  static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
8117
9072
  auto & kv_self = lctx.kv_self;
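
The reworked llama_decode_internal above no longer runs the whole batch through the graph in one go: it walks the input in windows of at most n_ubatch tokens, building a u_batch view into the caller's arrays for each window. A simplified, standalone sketch of that chunking (hypothetical struct, not the library code):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct ubatch_view {
        uint32_t        n_tokens;
        const int32_t * token; // points into the caller's token array
    };

    // split n_tokens_all tokens into micro-batches of at most n_ubatch tokens
    static std::vector<ubatch_view> split_into_ubatches(
            const std::vector<int32_t> & tokens, uint32_t n_ubatch) {
        std::vector<ubatch_view> views;
        const uint32_t n_tokens_all = (uint32_t) tokens.size();

        for (uint32_t cur = 0; cur < n_tokens_all; cur += n_ubatch) {
            const uint32_t n = std::min(n_ubatch, n_tokens_all - cur);
            views.push_back({ n, tokens.data() + cur });
        }
        return views;
    }

    int main() {
        std::vector<int32_t> tokens(1300, 0);
        const auto views = split_into_ubatches(tokens, 512);
        for (size_t i = 0; i < views.size(); ++i) {
            printf("ubatch %zu: %u tokens\n", i, views[i].n_tokens); // 512, 512, 276
        }
        return 0;
    }
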
@@ -8130,6 +9085,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
8130
9085
  // number of cells moved
8131
9086
  uint32_t n_moves = 0;
8132
9087
 
9088
+ // each move requires 6*n_layer tensors (see build_defrag)
9089
+ // - source view, destination view, copy operation
9090
+ // - x2 for keys and values
9091
+ const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
9092
+
8133
9093
  // determine which KV cells to move where
8134
9094
  //
8135
9095
  // cell i moves to ids[i]
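
The new max_moves bound caps the size of the defrag graph up front instead of re-checking it inside the search loop: each moved cell needs a source view, a destination view and a copy op, for both K and V, in every layer, hence 6*n_layer graph nodes per move. A small worked example, assuming LLAMA_MAX_NODES is 8192 (its value in this version of the file) and a hypothetical 32-layer model:

    #include <cstdio>

    int main() {
        const unsigned max_nodes = 8192; // LLAMA_MAX_NODES in this version
        const unsigned n_layer   = 32;   // hypothetical model depth

        // 3 tensors (src view, dst view, copy) x 2 caches (K and V) = 6 per layer per move
        const unsigned max_moves = max_nodes / (6*n_layer);
        printf("max_moves = %u\n", max_moves); // 42
        return 0;
    }
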
@@ -8156,15 +9116,6 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
8156
9116
  nh++;
8157
9117
  }
8158
9118
 
8159
- // each move requires 6*n_layer tensors (see build_defrag)
8160
- // - source view, destination view, copy operation
8161
- // - x2 for keys and values
8162
- //
8163
- if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
8164
- // the graph is too big, we cannot move more cells
8165
- break;
8166
- }
8167
-
8168
9119
  uint32_t nf = 0;
8169
9120
  uint32_t is = n_kv - 1;
8170
9121
 
@@ -8194,11 +9145,19 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
8194
9145
  // are we moving a continuous block of memory?
8195
9146
  bool cont = false;
8196
9147
 
9148
+ // should we stop searching for the next move?
9149
+ bool stop = false;
9150
+
8197
9151
  // go back and move the nf cells to the hole
8198
9152
  for (; i1 < n_kv; ++i1) {
8199
9153
  auto & cell1 = kv_self.cells[i1];
8200
9154
 
8201
9155
  if (cell1.is_empty() || ids[i1] != n_kv) {
9156
+ if (n_moves == max_moves) {
9157
+ stop = true;
9158
+ break;
9159
+ }
9160
+
8202
9161
  cont = false;
8203
9162
  continue;
8204
9163
  }
@@ -8225,6 +9184,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
8225
9184
  }
8226
9185
  }
8227
9186
 
9187
+ if (stop || n_moves == max_moves) {
9188
+ break;
9189
+ }
9190
+
8228
9191
  //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
8229
9192
 
8230
9193
  i0 += nh - 1;
@@ -8311,6 +9274,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
8311
9274
  #else
8312
9275
  // ggml_graph defrag
8313
9276
 
9277
+ ggml_backend_sched_reset(lctx.sched);
9278
+
8314
9279
  ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
8315
9280
 
8316
9281
  llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
@@ -8322,14 +9287,22 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
8322
9287
  }
8323
9288
 
8324
9289
  static void llama_kv_cache_update_internal(struct llama_context & lctx) {
9290
+ bool need_reserve = false;
9291
+
8325
9292
  // apply K-shift if needed
8326
9293
  if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
8327
- llama_set_k_shift(lctx);
8328
-
8329
9294
  {
9295
+ ggml_backend_sched_reset(lctx.sched);
9296
+
8330
9297
  ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
8331
9298
 
9299
+ ggml_backend_sched_alloc_graph(lctx.sched, gf);
9300
+
9301
+ llama_set_k_shift(lctx);
9302
+
8332
9303
  llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
9304
+
9305
+ need_reserve = true;
8333
9306
  }
8334
9307
 
8335
9308
  {
@@ -8343,12 +9316,56 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
8343
9316
  }
8344
9317
  }
8345
9318
 
9319
+ if (lctx.kv_self.recurrent && lctx.kv_self.do_copy) {
9320
+ {
9321
+ ggml_backend_sched_reset(lctx.sched);
9322
+
9323
+ ggml_cgraph * gf = llama_build_graph_s_copy(lctx);
9324
+
9325
+ ggml_backend_sched_alloc_graph(lctx.sched, gf);
9326
+
9327
+ llama_set_s_copy(lctx);
9328
+
9329
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
9330
+
9331
+ need_reserve = true;
9332
+ }
9333
+
9334
+ {
9335
+ auto & kv_self = lctx.kv_self;
9336
+
9337
+ kv_self.do_copy = false;
9338
+
9339
+ for (uint32_t i = 0; i < kv_self.size; ++i) {
9340
+ kv_self.cells[i].src = i;
9341
+ }
9342
+ }
9343
+ }
9344
+
8346
9345
  // defragment the KV cache if needed
8347
9346
  if (lctx.kv_self.do_defrag) {
8348
9347
  llama_kv_cache_defrag_internal(lctx);
8349
9348
 
9349
+ need_reserve = true;
9350
+
8350
9351
  lctx.kv_self.do_defrag = false;
8351
9352
  }
9353
+
9354
+ // reserve a worst case graph again
9355
+ if (need_reserve) {
9356
+ // TODO: extract to a function
9357
+ // build worst-case graph
9358
+ int n_tokens = (int)std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch);
9359
+ int n_past = lctx.cparams.n_ctx - n_tokens;
9360
+ llama_token token = llama_token_bos(&lctx.model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
9361
+ ggml_cgraph * gf = llama_build_graph(lctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
9362
+
9363
+ // initialize scheduler with the worst-case graph
9364
+ ggml_backend_sched_reset(lctx.sched);
9365
+ if (!ggml_backend_sched_reserve(lctx.sched, gf)) {
9366
+ LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
9367
+ }
9368
+ }
8352
9369
  }
8353
9370
 
8354
9371
  //
@@ -8360,46 +9377,53 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
8360
9377
  }
8361
9378
 
8362
9379
  static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
9380
+ GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
8363
9381
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
8364
9382
  }
8365
9383
 
8366
9384
  static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
9385
+ GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
8367
9386
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
8368
9387
  }
8369
9388
 
8370
9389
  static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
9390
+ GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
8371
9391
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
8372
9392
  }
8373
9393
 
8374
9394
  static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
9395
+ GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
8375
9396
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
8376
9397
  }
8377
9398
 
8378
9399
  static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
9400
+ GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
8379
9401
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
8380
9402
  }
8381
9403
 
8382
9404
  static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
9405
+ GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
8383
9406
  GGML_ASSERT(llama_is_byte_token(vocab, id));
8384
9407
  const auto& token_data = vocab.id_to_token.at(id);
8385
9408
  switch (llama_vocab_get_type(vocab)) {
8386
- case LLAMA_VOCAB_TYPE_SPM: {
8387
- auto buf = token_data.text.substr(3, 2);
8388
- return strtol(buf.c_str(), NULL, 16);
8389
- }
8390
- case LLAMA_VOCAB_TYPE_BPE: {
8391
- GGML_ASSERT(false);
8392
- return unicode_to_bytes_bpe(token_data.text);
8393
- }
8394
- case LLAMA_VOCAB_TYPE_WPM: {
8395
- GGML_ASSERT(false);
8396
- }
8397
- default:
8398
- GGML_ASSERT(false);
9409
+ case LLAMA_VOCAB_TYPE_SPM: {
9410
+ auto buf = token_data.text.substr(3, 2);
9411
+ return strtol(buf.c_str(), NULL, 16);
9412
+ }
9413
+ case LLAMA_VOCAB_TYPE_BPE: {
9414
+ GGML_ASSERT(false);
9415
+ return unicode_utf8_to_byte(token_data.text);
9416
+ }
9417
+ case LLAMA_VOCAB_TYPE_WPM: {
9418
+ GGML_ASSERT(false);
9419
+ }
9420
+ default:
9421
+ GGML_ASSERT(false);
8399
9422
  }
8400
9423
  }
8401
9424
 
8402
9425
  static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
9426
+ GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
8403
9427
  static const char * hex = "0123456789ABCDEF";
8404
9428
  switch (llama_vocab_get_type(vocab)) {
8405
9429
  case LLAMA_VOCAB_TYPE_SPM: {
@@ -8414,7 +9438,7 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
8414
9438
  }
8415
9439
  case LLAMA_VOCAB_TYPE_WPM:
8416
9440
  case LLAMA_VOCAB_TYPE_BPE: {
8417
- return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
9441
+ return vocab.token_to_id.at(unicode_byte_to_utf8(ch));
8418
9442
  }
8419
9443
  default:
8420
9444
  GGML_ASSERT(false);
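
In the SPM branch above, byte tokens are stored as literal strings of the form "<0xXX>", so substr(3, 2) extracts the two hex digits and strtol converts them back to a byte. A tiny standalone illustration of that round trip (helper names are made up for the example):

    #include <cstdio>
    #include <cstdlib>
    #include <string>

    // "<0x41>" -> 0x41, mirroring the substr(3, 2) + strtol parsing above
    static unsigned char spm_byte_token_to_byte(const std::string & text) {
        const std::string hex = text.substr(3, 2);
        return (unsigned char) strtol(hex.c_str(), NULL, 16);
    }

    // 0x41 -> "<0x41>", the same "<0xXX>" format built for SPM byte tokens
    static std::string byte_to_spm_byte_token(unsigned char ch) {
        char buf[8];
        snprintf(buf, sizeof(buf), "<0x%02X>", ch);
        return buf;
    }

    int main() {
        const std::string tok = byte_to_spm_byte_token('A');
        printf("%s -> %u\n", tok.c_str(), spm_byte_token_to_byte(tok)); // <0x41> -> 65
        return 0;
    }
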
@@ -8754,9 +9778,9 @@ private:
8754
9778
  bpe_words.reserve(text.size());
8755
9779
  bpe_encoded_words.reserve(text.size());
8756
9780
 
8757
- auto cps = codepoints_from_utf8(text);
8758
- for (size_t i = 0; i < cps.size(); ++i)
8759
- text_utf.emplace_back(codepoint_to_utf8(cps[i]));
9781
+ const auto cpts = unicode_cpts_from_utf8(text);
9782
+ for (size_t i = 0; i < cpts.size(); ++i)
9783
+ text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
8760
9784
 
8761
9785
  for (int i = 0; i < (int)text_utf.size(); i++) {
8762
9786
  const std::string & utf_char = text_utf[i];
@@ -8806,40 +9830,40 @@ private:
8806
9830
  }
8807
9831
 
8808
9832
  if (!split_condition && !collecting) {
8809
- if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
9833
+ if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
8810
9834
  collecting_letter = true;
8811
9835
  collecting = true;
8812
9836
  }
8813
- else if (codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
9837
+ else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
8814
9838
  collecting_numeric = true;
8815
9839
  collecting = true;
8816
9840
  }
8817
9841
  else if (
8818
- ((codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (codepoint_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
8819
- (!token.size() && utf_char == " " && codepoint_type(utf_char_next) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
9842
+ ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
9843
+ (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
8820
9844
  ) {
8821
9845
  collecting_special = true;
8822
9846
  collecting = true;
8823
9847
  }
8824
- else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
9848
+ else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
8825
9849
  collecting_whitespace_lookahead = true;
8826
9850
  collecting = true;
8827
9851
  }
8828
- else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
9852
+ else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
8829
9853
  split_condition = true;
8830
9854
  }
8831
9855
  }
8832
9856
  else if (!split_condition && collecting) {
8833
- if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) {
9857
+ if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
8834
9858
  split_condition = true;
8835
9859
  }
8836
- else if (collecting_numeric && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
9860
+ else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
8837
9861
  split_condition = true;
8838
9862
  }
8839
- else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
9863
+ else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
8840
9864
  split_condition = true;
8841
9865
  }
8842
- else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
9866
+ else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
8843
9867
  split_condition = true;
8844
9868
  }
8845
9869
  }
@@ -8868,7 +9892,7 @@ private:
8868
9892
  for (std::string & word : bpe_words) {
8869
9893
  std::string encoded_token = "";
8870
9894
  for (char & c : word) {
8871
- encoded_token += bytes_to_unicode_bpe(c);
9895
+ encoded_token += unicode_byte_to_utf8(c);
8872
9896
  }
8873
9897
  bpe_encoded_words.emplace_back(encoded_token);
8874
9898
  }
@@ -8942,25 +9966,13 @@ struct llm_tokenizer_wpm {
8942
9966
  }
8943
9967
 
8944
9968
  std::vector<std::string> preprocess(const std::string & text) {
8945
- // normalalization form D
8946
- std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
8947
- std::vector<uint32_t> nfd_codepoints;
8948
- for (uint32_t code : codepoints) {
8949
- auto it = nfd_map.equal_range(code);
8950
- if (it.first != it.second) {
8951
- for (auto jt = it.first; jt != it.second; jt++) {
8952
- nfd_codepoints.push_back(jt->second);
8953
- }
8954
- } else {
8955
- nfd_codepoints.push_back(code);
8956
- }
8957
- }
9969
+ std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
8958
9970
 
8959
9971
  // strip accents, strip control, uniformize whitespace,
8960
9972
  // to lowercase, pad chinese characters, pad punctuation
8961
9973
  std::string new_str = "";
8962
- for (uint32_t code : nfd_codepoints) {
8963
- int type = codepoint_type(code);
9974
+ for (uint32_t code : cpts_nfd) {
9975
+ int type = unicode_cpt_type(code);
8964
9976
  if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
8965
9977
  continue;
8966
9978
  }
@@ -8968,7 +9980,7 @@ struct llm_tokenizer_wpm {
8968
9980
  if (type == CODEPOINT_TYPE_WHITESPACE) {
8969
9981
  code = ' ';
8970
9982
  }
8971
- std::string s = codepoint_to_utf8(code);
9983
+ std::string s = unicode_cpt_to_utf8(code);
8972
9984
  if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
8973
9985
  new_str += " ";
8974
9986
  new_str += s;
@@ -8988,8 +10000,7 @@ struct llm_tokenizer_wpm {
8988
10000
  if (r > l) words.push_back(new_str.substr(l, (r - l)));
8989
10001
  l = r + 1;
8990
10002
  r = l;
8991
- }
8992
- else {
10003
+ } else {
8993
10004
  r += 1;
8994
10005
  }
8995
10006
  }
@@ -9013,17 +10024,17 @@ struct llm_tokenizer_wpm {
9013
10024
  return code < 256 && ispunct(code);
9014
10025
  }
9015
10026
 
9016
- bool is_chinese_char(uint32_t codepoint) {
9017
- if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
9018
- (codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
9019
- (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
9020
- (codepoint >= 0x2A700 && codepoint <= 0x2B73F) ||
9021
- (codepoint >= 0x2B740 && codepoint <= 0x2B81F) ||
9022
- (codepoint >= 0x2B920 && codepoint <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
9023
- (codepoint >= 0xF900 && codepoint <= 0xFAFF) ||
9024
- (codepoint >= 0x2F800 && codepoint <= 0x2FA1F) ||
9025
- (codepoint >= 0x3000 && codepoint <= 0x303F) ||
9026
- (codepoint >= 0xFF00 && codepoint <= 0xFFEF)) {
10027
+ bool is_chinese_char(uint32_t cpt) {
10028
+ if ((cpt >= 0x4E00 && cpt <= 0x9FFF) ||
10029
+ (cpt >= 0x3400 && cpt <= 0x4DBF) ||
10030
+ (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
10031
+ (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
10032
+ (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
10033
+ (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
10034
+ (cpt >= 0xF900 && cpt <= 0xFAFF) ||
10035
+ (cpt >= 0x2F800 && cpt <= 0x2FA1F) ||
10036
+ (cpt >= 0x3000 && cpt <= 0x303F) ||
10037
+ (cpt >= 0xFF00 && cpt <= 0xFFEF)) {
9027
10038
  return true; // NOLINT
9028
10039
  }
9029
10040
  return false;
@@ -9244,6 +10255,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
9244
10255
  }
9245
10256
  }
9246
10257
  } break;
10258
+ case LLAMA_VOCAB_TYPE_NONE:
10259
+ GGML_ASSERT(false);
9247
10260
  }
9248
10261
 
9249
10262
  return output;
@@ -9600,7 +10613,7 @@ struct llama_grammar * llama_grammar_init(
9600
10613
 
9601
10614
  // loop over alternates of start rule to build initial stacks
9602
10615
  std::vector<std::vector<const llama_grammar_element *>> stacks;
9603
- pos = rules[start_rule_index];
10616
+ pos = vec_rules[start_rule_index].data();
9604
10617
  do {
9605
10618
  std::vector<const llama_grammar_element *> stack;
9606
10619
  if (!llama_grammar_is_end_of_sequence(pos)) {
@@ -10615,13 +11628,16 @@ struct quantize_state_internal {
10615
11628
 
10616
11629
  bool has_imatrix = false;
10617
11630
 
11631
+ // used to figure out if a model shares tok_embd with the output weight
11632
+ bool has_output = false;
11633
+
10618
11634
  quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
10619
11635
  : model(model)
10620
11636
  , params(params)
10621
11637
  {}
10622
11638
  };
10623
11639
 
10624
- static void llama_convert_tensor_internal(
11640
+ static void llama_tensor_dequantize_internal(
10625
11641
  struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
10626
11642
  const size_t nelements, const int nthread
10627
11643
  ) {
@@ -10682,7 +11698,7 @@ static void llama_convert_tensor_internal(
10682
11698
  workers.clear();
10683
11699
  }
10684
11700
 
10685
- static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
11701
+ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
10686
11702
  const std::string name = ggml_get_name(tensor);
10687
11703
 
10688
11704
  // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -10712,8 +11728,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10712
11728
 
10713
11729
  // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
10714
11730
  // with the quantization of the output tensor
10715
- if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
10716
- (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
11731
+ if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
10717
11732
  int nx = tensor->ne[0];
10718
11733
  if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
10719
11734
  new_type = GGML_TYPE_Q8_0;
@@ -10962,41 +11977,76 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10962
11977
  return new_type;
10963
11978
  }
10964
11979
 
11980
+ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
11981
+ std::mutex mutex;
11982
+ int counter = 0;
11983
+ size_t new_size = 0;
11984
+ if (nthread < 2) {
11985
+ // single-thread
11986
+ return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
11987
+ }
11988
+ auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
11989
+ nrows, n_per_row, imatrix]() {
11990
+ const int nrows_per_chunk = chunk_size / n_per_row;
11991
+ size_t local_size = 0;
11992
+ while (true) {
11993
+ std::unique_lock<std::mutex> lock(mutex);
11994
+ int first_row = counter; counter += nrows_per_chunk;
11995
+ if (first_row >= nrows) {
11996
+ if (local_size > 0) {
11997
+ new_size += local_size;
11998
+ }
11999
+ break;
12000
+ }
12001
+ lock.unlock();
12002
+ const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
12003
+ local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
12004
+ }
12005
+ };
12006
+ for (int it = 0; it < nthread - 1; ++it) {
12007
+ workers.emplace_back(compute);
12008
+ }
12009
+ compute();
12010
+ for (auto & w : workers) { w.join(); }
12011
+ workers.clear();
12012
+ return new_size;
12013
+ }
12014
+
10965
12015
  static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
10966
- ggml_type quantized_type;
12016
+ ggml_type default_type;
10967
12017
  llama_ftype ftype = params->ftype;
10968
12018
 
10969
12019
  switch (params->ftype) {
10970
- case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
10971
- case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
10972
- case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
10973
- case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
10974
- case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
10975
- case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
10976
- case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
12020
+ case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
12021
+ case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
12022
+ case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
12023
+ case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
12024
+ case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
12025
+ case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
12026
+ case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
10977
12027
 
10978
12028
  // K-quants
10979
12029
  case LLAMA_FTYPE_MOSTLY_Q2_K_S:
10980
- case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
10981
- case LLAMA_FTYPE_MOSTLY_IQ3_XS: quantized_type = GGML_TYPE_IQ3_S; break;
12030
+ case LLAMA_FTYPE_MOSTLY_Q2_K: default_type = GGML_TYPE_Q2_K; break;
12031
+ case LLAMA_FTYPE_MOSTLY_IQ3_XS: default_type = GGML_TYPE_IQ3_S; break;
10982
12032
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
10983
12033
  case LLAMA_FTYPE_MOSTLY_Q3_K_M:
10984
- case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
12034
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: default_type = GGML_TYPE_Q3_K; break;
10985
12035
  case LLAMA_FTYPE_MOSTLY_Q4_K_S:
10986
- case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
12036
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: default_type = GGML_TYPE_Q4_K; break;
10987
12037
  case LLAMA_FTYPE_MOSTLY_Q5_K_S:
10988
- case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
10989
- case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
10990
- case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
10991
- case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
10992
- case LLAMA_FTYPE_MOSTLY_IQ2_S: quantized_type = GGML_TYPE_IQ2_XS; break;
10993
- case LLAMA_FTYPE_MOSTLY_IQ2_M: quantized_type = GGML_TYPE_IQ2_S; break;
10994
- case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
10995
- case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
10996
- case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;
10997
- case LLAMA_FTYPE_MOSTLY_IQ4_XS: quantized_type = GGML_TYPE_IQ4_XS; break;
10998
- case LLAMA_FTYPE_MOSTLY_IQ3_S: quantized_type = GGML_TYPE_IQ3_S; break;
10999
- case LLAMA_FTYPE_MOSTLY_IQ3_M: quantized_type = GGML_TYPE_IQ3_S; break;
12038
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = GGML_TYPE_Q5_K; break;
12039
+ case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = GGML_TYPE_Q6_K; break;
12040
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
12041
+ case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break;
12042
+ case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break;
12043
+ case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
12044
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
12045
+ case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
12046
+ case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
12047
+ case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
12048
+ case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
12049
+ case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
11000
12050
 
11001
12051
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
11002
12052
  }
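
The new llama_tensor_quantize_internal hands out chunks of rows through a mutex-protected counter: each worker grabs the next chunk, quantizes it, and adds its local output size to the shared total before exiting. A stripped-down sketch of the same work-distribution pattern — quantize_rows below is only a stand-in for ggml_quantize_chunk, everything else is standard C++:

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>
    #include <mutex>
    #include <thread>
    #include <vector>

    // stand-in for the real per-chunk quantization; returns bytes "written"
    static size_t quantize_rows(int first_row, int n_rows, int n_per_row) {
        (void) first_row;
        return (size_t) n_rows * n_per_row / 2; // pretend 4-bit output
    }

    static size_t quantize_parallel(int nrows, int n_per_row, int nrows_per_chunk, int nthread) {
        std::mutex mutex;
        int    counter  = 0;
        size_t new_size = 0;

        auto compute = [&]() {
            size_t local_size = 0;
            while (true) {
                std::unique_lock<std::mutex> lock(mutex);
                const int first_row = counter;
                counter += nrows_per_chunk;
                if (first_row >= nrows) {
                    new_size += local_size; // publish under the lock, then stop
                    break;
                }
                lock.unlock();
                const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
                local_size += quantize_rows(first_row, this_nrow, n_per_row);
            }
        };

        std::vector<std::thread> workers;
        for (int it = 0; it < nthread - 1; ++it) {
            workers.emplace_back(compute);
        }
        compute(); // the calling thread participates too
        for (auto & w : workers) { w.join(); }
        return new_size;
    }

    int main() {
        // 1000 rows of 4096 values each, in chunks of 64 rows, on 4 threads
        printf("total = %zu bytes\n", quantize_parallel(1000, 4096, 64, 4)); // 2048000
        return 0;
    }
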
@@ -11062,6 +12112,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
11062
12112
  else if (name.find("ffn_up") != std::string::npos) {
11063
12113
  ++qs.n_ffn_up;
11064
12114
  }
12115
+ else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
12116
+ qs.has_output = true;
12117
+ }
11065
12118
  }
11066
12119
  if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
11067
12120
  LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
@@ -11070,11 +12123,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
11070
12123
 
11071
12124
  size_t total_size_org = 0;
11072
12125
  size_t total_size_new = 0;
11073
- std::vector<int64_t> hist_all(1 << 4, 0);
11074
12126
 
11075
12127
  std::vector<std::thread> workers;
11076
12128
  workers.reserve(nthread);
11077
- std::mutex mutex;
11078
12129
 
11079
12130
  int idx = 0;
11080
12131
 
@@ -11133,20 +12184,29 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
11133
12184
  quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
11134
12185
  quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
11135
12186
 
12187
+ // do not quantize Mamba's small yet 2D weights
12188
+ // NOTE: can't use LLM_TN here because the layer number is not known
12189
+ quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
12190
+ quantize &= name.find("ssm_x.weight") == std::string::npos;
12191
+ quantize &= name.find("ssm_dt.weight") == std::string::npos;
12192
+
11136
12193
  enum ggml_type new_type;
11137
12194
  void * new_data;
11138
12195
  size_t new_size;
11139
12196
 
11140
12197
  if (quantize) {
11141
- new_type = quantized_type;
11142
- if (!params->pure) {
11143
- new_type = get_k_quant_type(qs, new_type, tensor, ftype);
12198
+ new_type = default_type;
12199
+
12200
+ // get more optimal quantization type based on the tensor shape, layer, etc.
12201
+ if (!params->pure && ggml_is_quantized(default_type)) {
12202
+ new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
11144
12203
  }
11145
12204
 
11146
12205
  // If we've decided to quantize to the same type the tensor is already
11147
12206
  // in then there's nothing to do.
11148
12207
  quantize = tensor->type != new_type;
11149
12208
  }
12209
+
11150
12210
  if (!quantize) {
11151
12211
  new_type = tensor->type;
11152
12212
  new_data = tensor->data;
@@ -11188,18 +12248,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
11188
12248
  } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
11189
12249
  throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
11190
12250
  } else {
11191
- llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
12251
+ llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
11192
12252
  f32_data = (float *) f32_conv_buf.data();
11193
12253
  }
11194
12254
 
11195
- LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
12255
+ LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
11196
12256
  fflush(stdout);
11197
12257
 
11198
12258
  if (work.size() < nelements * 4) {
11199
12259
  work.resize(nelements * 4); // upper bound on size
11200
12260
  }
11201
12261
  new_data = work.data();
11202
- std::array<int64_t, 1 << 4> hist_cur = {};
11203
12262
 
11204
12263
  const int n_per_row = tensor->ne[0];
11205
12264
  const int nrows = nelements / n_per_row;
@@ -11209,56 +12268,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
11209
12268
 
11210
12269
  const int nchunk = (nelements + chunk_size - 1)/chunk_size;
11211
12270
  const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
11212
- if (nthread_use < 2) {
11213
- new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
11214
- } else {
11215
- int counter = 0;
11216
- new_size = 0;
11217
- auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
11218
- nrows, n_per_row, imatrix]() {
11219
- std::array<int64_t, 1 << 4> local_hist = {};
11220
- const int nrows_per_chunk = chunk_size / n_per_row;
11221
- size_t local_size = 0;
11222
- while (true) {
11223
- std::unique_lock<std::mutex> lock(mutex);
11224
- int first_row = counter; counter += nrows_per_chunk;
11225
- if (first_row >= nrows) {
11226
- if (local_size > 0) {
11227
- for (int j=0; j<int(local_hist.size()); ++j) {
11228
- hist_cur[j] += local_hist[j];
11229
- }
11230
- new_size += local_size;
11231
- }
11232
- break;
11233
- }
11234
- lock.unlock();
11235
- const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
11236
- local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
11237
- first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
11238
- }
11239
- };
11240
- for (int it = 0; it < nthread_use - 1; ++it) {
11241
- workers.emplace_back(compute);
11242
- }
11243
- compute();
11244
- for (auto & w : workers) { w.join(); }
11245
- workers.clear();
11246
- }
11247
-
11248
- LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
11249
- int64_t tot_count = 0;
11250
- for (size_t i = 0; i < hist_cur.size(); i++) {
11251
- hist_all[i] += hist_cur[i];
11252
- tot_count += hist_cur[i];
11253
- }
12271
+ new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix, workers, nthread_use);
11254
12272
 
11255
- if (tot_count > 0) {
11256
- LLAMA_LOG_INFO(" | hist: ");
11257
- for (size_t i = 0; i < hist_cur.size(); i++) {
11258
- LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
11259
- }
11260
- }
11261
- LLAMA_LOG_INFO("\n");
12273
+ LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
11262
12274
  }
11263
12275
  total_size_org += ggml_nbytes(tensor);
11264
12276
  total_size_new += new_size;
@@ -11287,24 +12299,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
11287
12299
  LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
11288
12300
  LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
11289
12301
 
11290
- // print histogram for all tensors
11291
- {
11292
- int64_t sum_all = 0;
11293
- for (size_t i = 0; i < hist_all.size(); i++) {
11294
- sum_all += hist_all[i];
11295
- }
11296
-
11297
- if (sum_all > 0) {
11298
- LLAMA_LOG_INFO("%s: hist: ", __func__);
11299
- for (size_t i = 0; i < hist_all.size(); i++) {
11300
- LLAMA_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all));
11301
- }
11302
- LLAMA_LOG_INFO("\n");
11303
- }
11304
- }
11305
-
11306
12302
  if (qs.n_fallback > 0) {
11307
- LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n",
12303
+ LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
11308
12304
  __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
11309
12305
  }
11310
12306
  }
@@ -11616,10 +12612,13 @@ struct llama_context_params llama_context_default_params() {
11616
12612
  struct llama_context_params result = {
11617
12613
  /*.seed =*/ LLAMA_DEFAULT_SEED,
11618
12614
  /*.n_ctx =*/ 512,
11619
- /*.n_batch =*/ 512,
12615
+ /*.n_batch =*/ 2048,
12616
+ /*.n_ubatch =*/ 512,
12617
+ /*.n_seq_max =*/ 1,
11620
12618
  /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
11621
12619
  /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
11622
12620
  /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
12621
+ /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
11623
12622
  /*.rope_freq_base =*/ 0.0f,
11624
12623
  /*.rope_freq_scale =*/ 0.0f,
11625
12624
  /*.yarn_ext_factor =*/ -1.0f,
@@ -11633,9 +12632,10 @@ struct llama_context_params llama_context_default_params() {
11633
12632
  /*.type_k =*/ GGML_TYPE_F16,
11634
12633
  /*.type_v =*/ GGML_TYPE_F16,
11635
12634
  /*.logits_all =*/ false,
11636
- /*.embedding =*/ false,
12635
+ /*.embeddings =*/ false,
11637
12636
  /*.offload_kqv =*/ true,
11638
- /*.do_pooling =*/ true,
12637
+ /*.abort_callback =*/ nullptr,
12638
+ /*.abort_callback_data =*/ nullptr,
11639
12639
  };
11640
12640
 
11641
12641
  return result;
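
With the new defaults above, n_batch (the most tokens a single decode call may submit) and n_ubatch (the micro-batch size it is split into internally) are separate knobs, next to n_seq_max, pooling_type, embeddings and the abort callback. A hedged usage sketch of how a caller might fill these fields — the field names come from the initializer above, the chosen values are arbitrary, and the surrounding model-loading code is assumed, not shown:

    #include "llama.h"

    // sketch only: adjust the context parameters introduced in this version
    static struct llama_context_params make_ctx_params() {
        struct llama_context_params cparams = llama_context_default_params();

        cparams.n_ctx     = 4096; // context window
        cparams.n_batch   = 2048; // max tokens per decode call
        cparams.n_ubatch  = 512;  // internal micro-batch size
        cparams.n_seq_max = 4;    // parallel sequences (also sizes Mamba's recurrent state)

        cparams.embeddings   = true;                    // formerly "embedding"
        cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN; // overrides the model's default

        return cparams;
    }
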
@@ -11767,6 +12767,17 @@ struct llama_context * llama_new_context_with_model(
11767
12767
  struct llama_context_params params) {
11768
12768
 
11769
12769
  if (!model) {
12770
+ LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
12771
+ return nullptr;
12772
+ }
12773
+
12774
+ if (params.n_batch == 0 && params.n_ubatch == 0) {
12775
+ LLAMA_LOG_ERROR("%s: n_batch and n_ubatch cannot both be zero\n", __func__);
12776
+ return nullptr;
12777
+ }
12778
+
12779
+ if (params.n_ctx == 0 && model->hparams.n_ctx_train == 0) {
12780
+ LLAMA_LOG_ERROR("%s: n_ctx and model->hparams.n_ctx_train cannot both be zero\n", __func__);
11770
12781
  return nullptr;
11771
12782
  }
11772
12783
 
@@ -11775,7 +12786,7 @@ struct llama_context * llama_new_context_with_model(
11775
12786
  const auto & hparams = model->hparams;
11776
12787
  auto & cparams = ctx->cparams;
11777
12788
 
11778
- cparams.n_batch = params.n_batch;
12789
+ // TODO: maybe add n_seq_max here too
11779
12790
  cparams.n_threads = params.n_threads;
11780
12791
  cparams.n_threads_batch = params.n_threads_batch;
11781
12792
  cparams.yarn_ext_factor = params.yarn_ext_factor;
@@ -11783,13 +12794,19 @@ struct llama_context * llama_new_context_with_model(
11783
12794
  cparams.yarn_beta_fast = params.yarn_beta_fast;
11784
12795
  cparams.yarn_beta_slow = params.yarn_beta_slow;
11785
12796
  cparams.defrag_thold = params.defrag_thold;
12797
+ cparams.embeddings = params.embeddings;
11786
12798
  cparams.offload_kqv = params.offload_kqv;
11787
- cparams.do_pooling = params.do_pooling;
12799
+ cparams.pooling_type = params.pooling_type;
11788
12800
 
11789
12801
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
11790
12802
  cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
11791
12803
  cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
11792
12804
 
12805
+ // with causal attention, the batch size is limited by the context size
12806
+ cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
12807
+ cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
12808
+
12809
+
11793
12810
  cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
11794
12811
  hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
11795
12812
  hparams.n_ctx_train;
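
To make the clamping a few lines above concrete: with causal attention n_batch can never exceed the context size, and n_ubatch can never exceed n_batch (falling back to the requested n_batch when the caller passes 0). A throwaway example with hypothetical values:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t n_ctx = 2048, req_batch = 4096, req_ubatch = 0; // hypothetical request
        const bool causal_attn = true;

        const uint32_t n_batch  = causal_attn ? std::min(n_ctx, req_batch) : req_batch;
        const uint32_t n_ubatch = std::min(n_batch, req_ubatch == 0 ? req_batch : req_ubatch);

        printf("n_batch = %u, n_ubatch = %u\n", n_batch, n_ubatch); // 2048, 2048
        return 0;
    }
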
@@ -11810,19 +12827,44 @@ struct llama_context * llama_new_context_with_model(
11810
12827
  cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
11811
12828
  }
11812
12829
 
12830
+ cparams.causal_attn = hparams.causal_attn;
12831
+
12832
+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
12833
+ if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
12834
+ cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
12835
+ } else {
12836
+ cparams.pooling_type = hparams.pooling_type;
12837
+ }
12838
+ }
12839
+
11813
12840
  if (params.seed == LLAMA_DEFAULT_SEED) {
11814
12841
  params.seed = time(NULL);
11815
12842
  }
11816
12843
 
11817
12844
  LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
12845
+ LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
12846
+ LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
11818
12847
  LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
11819
12848
  LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
11820
12849
 
11821
- ctx->rng = std::mt19937(params.seed);
11822
- ctx->logits_all = params.logits_all;
12850
+ ctx->abort_callback = params.abort_callback;
12851
+ ctx->abort_callback_data = params.abort_callback_data;
12852
+
12853
+ ctx->rng = std::mt19937(params.seed);
12854
+ ctx->logits_all = params.logits_all;
11823
12855
 
11824
- const ggml_type type_k = params.type_k;
11825
- const ggml_type type_v = params.type_v;
12856
+ uint32_t kv_size = cparams.n_ctx;
12857
+ ggml_type type_k = params.type_k;
12858
+ ggml_type type_v = params.type_v;
12859
+
12860
+ // Mamba only needs a constant number of KV cache cells per sequence
12861
+ if (model->arch == LLM_ARCH_MAMBA) {
12862
+ // Mamba needs at least as many KV cells as there are sequences kept at any time
12863
+ kv_size = std::max((uint32_t) 1, params.n_seq_max);
12864
+ // it's probably best to keep as much precision as possible for the states
12865
+ type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states
12866
+ type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states
12867
+ }
11826
12868
 
11827
12869
  GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
11828
12870
  GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
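
To illustrate the Mamba-specific cache sizing above: for a recurrent model the number of cache cells is tied to the number of sequences rather than to the context length, and the state tensors stay in F32. A rough sketch of that decision with assumed inputs:

    #include <algorithm>
    #include <cstdint>

    // sketch of the cache-sizing rule above, using illustrative inputs only
    void mamba_kv_size_example() {
        const bool     is_mamba  = true;    // model->arch == LLM_ARCH_MAMBA
        const uint32_t n_ctx     = 32768;   // requested context length
        const uint32_t n_seq_max = 8;       // parallel sequences the caller plans to keep

        // transformers: one KV cell per context position; Mamba: one F32 state cell per sequence
        const uint32_t kv_size = is_mamba ? std::max(1u, n_seq_max) : n_ctx;  // 8 vs 32768 cells
        (void) kv_size;
    }
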
@@ -11877,13 +12919,31 @@ struct llama_context * llama_new_context_with_model(
11877
12919
  }
11878
12920
  #elif defined(GGML_USE_SYCL)
11879
12921
  if (model->n_gpu_layers > 0) {
11880
- ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
11881
- if (backend == nullptr) {
11882
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
11883
- llama_free(ctx);
11884
- return nullptr;
12922
+ // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
12923
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
12924
+ int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
12925
+ ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
12926
+ if (backend == nullptr) {
12927
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
12928
+ llama_free(ctx);
12929
+ return nullptr;
12930
+ }
12931
+ ctx->backends.push_back(backend);
12932
+ } else {
12933
+ // LLAMA_SPLIT_LAYER requires a backend for each GPU
12934
+ int id_list[GGML_SYCL_MAX_DEVICES];
12935
+ ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
12936
+ for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
12937
+ int device_id = id_list[i];
12938
+ ggml_backend_t backend = ggml_backend_sycl_init(i);
12939
+ if (backend == nullptr) {
12940
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
12941
+ llama_free(ctx);
12942
+ return nullptr;
12943
+ }
12944
+ ctx->backends.push_back(backend);
12945
+ }
11885
12946
  }
11886
- ctx->backends.push_back(backend);
11887
12947
  }
11888
12948
  #elif defined(GGML_USE_KOMPUTE)
11889
12949
  if (model->n_gpu_layers > 0) {
@@ -11904,7 +12964,7 @@ struct llama_context * llama_new_context_with_model(
11904
12964
  }
11905
12965
  ctx->backends.push_back(ctx->backend_cpu);
11906
12966
 
11907
- if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) {
12967
+ if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) {
11908
12968
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
11909
12969
  llama_free(ctx);
11910
12970
  return nullptr;
@@ -11928,45 +12988,31 @@ struct llama_context * llama_new_context_with_model(
11928
12988
  ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
11929
12989
  }
11930
12990
 
11931
- // resized during inference, reserve maximum
11932
- ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
11933
-
11934
- if (params.embedding) {
11935
- ctx->embedding.resize(hparams.n_embd);
11936
- }
11937
-
11938
- // graph inputs
12991
+ // graph outputs buffer
11939
12992
  {
11940
- ggml_init_params init_params = {
11941
- /* .mem_size */ ggml_tensor_overhead()*8,
11942
- /* .mem_buffer */ nullptr,
11943
- /* .no_alloc */ true,
11944
- };
11945
- ctx->ctx_input = ggml_init(init_params);
12993
+ // resized during inference, reserve maximum
12994
+ ctx->logits_size = hparams.n_vocab*cparams.n_batch;
12995
+ ctx->embd_size = params.embeddings ? hparams.n_embd*cparams.n_batch : 0;
11946
12996
 
11947
- ctx->inp_tokens = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
11948
- ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
11949
- ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
11950
- ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
11951
- ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
11952
- ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
11953
- ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
11954
- ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
12997
+ const size_t buf_output_size = (ctx->logits_size + ctx->embd_size)*sizeof(float);
12998
+
12999
+ ctx->buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_output_size);
13000
+ if (ctx->buf_output == nullptr) {
13001
+ LLAMA_LOG_ERROR("%s: failed to allocate logits buffer\n", __func__);
13002
+ llama_free(ctx);
13003
+ return nullptr;
13004
+ }
13005
+ ggml_backend_buffer_clear(ctx->buf_output, 0);
11955
13006
 
11956
- ggml_set_name(ctx->inp_tokens, "inp_tokens");
11957
- ggml_set_name(ctx->inp_embd, "inp_embd");
11958
- ggml_set_name(ctx->inp_pos, "inp_pos");
11959
- ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
11960
- ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos");
11961
- ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
11962
- ggml_set_name(ctx->inp_mean, "inp_mean");
11963
- ggml_set_name(ctx->inp_cls, "inp_cls");
11964
13007
 
11965
- ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
13008
+ ctx->logits = (float *) ggml_backend_buffer_get_base(ctx->buf_output);
13009
+ if (params.embeddings) {
13010
+ ctx->embd = ctx->logits + ctx->logits_size;
13011
+ }
11966
13012
 
11967
- LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
11968
- ggml_backend_buffer_name(ctx->buf_input),
11969
- ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
13013
+ LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
13014
+ ggml_backend_buffer_name(ctx->buf_output),
13015
+ ggml_backend_buffer_get_size(ctx->buf_output) / 1024.0 / 1024.0);
11970
13016
  }
11971
13017
 
11972
13018
  // scheduler and compute buffers
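
The graph-outputs change above replaces the old std::vector logits/embedding storage with a single backend buffer that holds the logits followed by the embeddings. A back-of-the-envelope sizing under assumed dimensions (none taken from a real model):

    #include <cstddef>

    void output_buffer_size_example() {
        const size_t n_vocab = 32000, n_embd = 4096, n_batch = 512;

        const size_t logits_size = n_vocab * n_batch;  // 16,384,000 floats
        const size_t embd_size   = n_embd  * n_batch;  //  2,097,152 floats (only when embeddings are enabled)
        const size_t buf_bytes   = (logits_size + embd_size) * sizeof(float);  // ~70.5 MiB, one backend buffer
        (void) buf_bytes;
    }
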
@@ -11985,10 +13031,21 @@ struct llama_context * llama_new_context_with_model(
11985
13031
  // buffer used to store the computation graph and the tensor meta data
11986
13032
  ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
11987
13033
 
11988
- ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
13034
+ // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
13035
+ bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
13036
+ #ifndef GGML_USE_CUBLAS
13037
+ // pipeline parallelism requires support for async compute and events
13038
+ // currently this is only implemented in the CUDA backend
13039
+ pipeline_parallel = false;
13040
+ #endif
13041
+ ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel);
13042
+
13043
+ if (pipeline_parallel) {
13044
+ LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
13045
+ }
11989
13046
 
11990
13047
  // build worst-case graph
11991
- int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
13048
+ int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_ubatch);
11992
13049
  int n_past = cparams.n_ctx - n_tokens;
11993
13050
  llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
11994
13051
  ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
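
The scheduler is now created with an extra pipeline-parallelism flag; per the comments above it is only enabled when more than one device is present, every layer is offloaded, layer splitting is used, and the backend supports async compute (currently CUDA). A condensed restatement of that predicate with placeholder values:

    void pipeline_parallel_example() {
        const int  device_count = 2;     // llama_get_device_count()
        const int  n_gpu_layers = 33;    // layers the caller offloads
        const int  n_layer      = 32;    // model->hparams.n_layer
        const bool split_layer  = true;  // model->split_mode == LLAMA_SPLIT_MODE_LAYER
        const bool cuda_backend = true;  // built with GGML_USE_CUBLAS, the only backend supporting it here

        const bool pipeline_parallel = cuda_backend && device_count > 1
                                    && n_gpu_layers > n_layer && split_layer;
        (void) pipeline_parallel;
    }
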
@@ -12011,7 +13068,7 @@ struct llama_context * llama_new_context_with_model(
12011
13068
 
12012
13069
  // note: the number of splits during measure is higher than during inference due to the kv shift
12013
13070
  int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
12014
- LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
13071
+ LLAMA_LOG_INFO("%s: graph splits: %d\n", __func__, n_splits);
12015
13072
  }
12016
13073
  }
12017
13074
 
@@ -12048,6 +13105,14 @@ uint32_t llama_n_batch(const struct llama_context * ctx) {
12048
13105
  return ctx->cparams.n_batch;
12049
13106
  }
12050
13107
 
13108
+ uint32_t llama_n_ubatch(const struct llama_context * ctx) {
13109
+ return ctx->cparams.n_ubatch;
13110
+ }
13111
+
13112
+ uint32_t llama_n_seq_max(const struct llama_context * ctx) {
13113
+ return ctx->kv_self.size;
13114
+ }
13115
+
12051
13116
  enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
12052
13117
  return model->vocab.type;
12053
13118
  }
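
The two new getters added here can be used to size per-call buffers; a small usage sketch (the helper name print_limits is illustrative, the context is assumed to be already created):

    #include <cstdio>
    #include "llama.h"

    void print_limits(const llama_context * ctx) {
        printf("n_ctx     = %u\n", llama_n_ctx(ctx));
        printf("n_batch   = %u\n", llama_n_batch(ctx));
        printf("n_ubatch  = %u\n", llama_n_ubatch(ctx));   // new in this version
        printf("n_seq_max = %u\n", llama_n_seq_max(ctx));  // new: reported as the KV cache size
    }
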
@@ -12061,6 +13126,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
12061
13126
  case LLM_ARCH_MPT:
12062
13127
  case LLM_ARCH_REFACT:
12063
13128
  case LLM_ARCH_BLOOM:
13129
+ case LLM_ARCH_MAMBA:
12064
13130
  return LLAMA_ROPE_TYPE_NONE;
12065
13131
 
12066
13132
  // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -12084,6 +13150,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
12084
13150
  case LLM_ARCH_QWEN2:
12085
13151
  case LLM_ARCH_PHI2:
12086
13152
  case LLM_ARCH_GEMMA:
13153
+ case LLM_ARCH_STARCODER2:
12087
13154
  return LLAMA_ROPE_TYPE_NEOX;
12088
13155
 
12089
13156
  // all model arches should be listed explicitly here
@@ -12096,7 +13163,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
12096
13163
  }
12097
13164
 
12098
13165
  int32_t llama_n_vocab(const struct llama_model * model) {
12099
- return model->vocab.id_to_token.size();
13166
+ return model->hparams.n_vocab;
12100
13167
  }
12101
13168
 
12102
13169
  int32_t llama_n_ctx_train(const struct llama_model * model) {
@@ -12206,10 +13273,10 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
12206
13273
  }
12207
13274
  }
12208
13275
 
12209
- struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
13276
+ struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
12210
13277
  struct llama_kv_cache_view result = {
12211
13278
  /*.n_cells = */ 0,
12212
- /*.n_max_seq = */ n_max_seq,
13279
+ /*.n_seq_max = */ n_seq_max,
12213
13280
  /*.token_count = */ 0,
12214
13281
  /*.used_cells = */ llama_get_kv_cache_used_cells(ctx),
12215
13282
  /*.max_contiguous = */ 0,
@@ -12237,7 +13304,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
12237
13304
  void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
12238
13305
  GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
12239
13306
  view->cells = (struct llama_kv_cache_view_cell *)p;
12240
- p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
13307
+ p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
12241
13308
  GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
12242
13309
  view->cells_sequences = (llama_seq_id *)p;
12243
13310
  }
@@ -12251,7 +13318,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
12251
13318
  uint32_t max_contig = 0;
12252
13319
  int32_t max_contig_idx = -1;
12253
13320
 
12254
- for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
13321
+ for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_seq_max) {
12255
13322
  const size_t curr_size = kv_cells[i].seq_id.size();
12256
13323
  token_count += curr_size;
12257
13324
  c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
@@ -12268,7 +13335,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
12268
13335
 
12269
13336
  int seq_idx = 0;
12270
13337
  for (const llama_seq_id it : kv_cells[i].seq_id) {
12271
- if (seq_idx >= view->n_max_seq) {
13338
+ if (seq_idx >= view->n_seq_max) {
12272
13339
  break;
12273
13340
  }
12274
13341
  cs_curr[seq_idx] = it;
@@ -12277,7 +13344,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
12277
13344
  if (seq_idx != 0) {
12278
13345
  used_cells++;
12279
13346
  }
12280
- for (; seq_idx < view->n_max_seq; seq_idx++) {
13347
+ for (; seq_idx < view->n_seq_max; seq_idx++) {
12281
13348
  cs_curr[seq_idx] = -1;
12282
13349
  }
12283
13350
  }
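
The KV-cache view API keeps its shape but renames n_max_seq to n_seq_max. A hedged usage sketch for inspecting cache occupancy (helper name is illustrative):

    #include <cstdio>
    #include "llama.h"

    void dump_kv_usage(const llama_context * ctx) {
        llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_seq_max=*/4);
        llama_kv_cache_view_update(ctx, &view);
        printf("cells=%d used=%d tokens=%d max_contiguous=%d\n",
               view.n_cells, view.used_cells, view.token_count, view.max_contiguous);
        llama_kv_cache_view_free(&view);
    }
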
@@ -12313,8 +13380,8 @@ void llama_kv_cache_clear(struct llama_context * ctx) {
12313
13380
  llama_kv_cache_clear(ctx->kv_self);
12314
13381
  }
12315
13382
 
12316
- void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
12317
- llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
13383
+ bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
13384
+ return llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
12318
13385
  }
12319
13386
 
12320
13387
  void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
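
llama_kv_cache_seq_rm now reports whether the removal succeeded; it can fail, for example, for recurrent models such as Mamba where removing only part of a sequence's history is not possible. A sketch of a caller that falls back to clearing the whole sequence (helper name illustrative):

    #include "llama.h"

    // drop positions [p0, p1) of sequence 0, with a whole-sequence fallback
    void drop_range(llama_context * ctx, llama_pos p0, llama_pos p1) {
        if (!llama_kv_cache_seq_rm(ctx, 0, p0, p1)) {
            // partial removal rejected (e.g. recurrent state cache) -> remove the full sequence
            llama_kv_cache_seq_rm(ctx, 0, -1, -1);
        }
    }
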
@@ -12365,12 +13432,17 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
12365
13432
  const size_t s_rng = LLAMA_MAX_RNG_STATE;
12366
13433
  const size_t s_logits_size = sizeof(size_t);
12367
13434
  // assume worst case for logits although only currently set ones are serialized
12368
- const size_t s_logits = ctx->logits.capacity() * sizeof(float);
13435
+ const size_t s_logits = ctx->logits_size * sizeof(float);
12369
13436
  const size_t s_embedding_size = sizeof(size_t);
12370
- const size_t s_embedding = ctx->embedding.size() * sizeof(float);
12371
- const size_t s_kv_size = sizeof(size_t);
12372
- const size_t s_kv_ntok = sizeof(int);
13437
+ const size_t s_embedding = ctx->embd_size * sizeof(float);
13438
+ const size_t s_kv_buf_size = sizeof(size_t);
13439
+ const size_t s_kv_head = sizeof(uint32_t);
13440
+ const size_t s_kv_size = sizeof(uint32_t);
13441
+ const size_t s_kv_used = sizeof(uint32_t);
12373
13442
  const size_t s_kv = ctx->kv_self.total_size();
13443
+ // TODO: assume the max is more than 1 seq_id per KV cell
13444
+ const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
13445
+ const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
12374
13446
 
12375
13447
  const size_t s_total = (
12376
13448
  + s_rng_size
@@ -12379,9 +13451,12 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
12379
13451
  + s_logits
12380
13452
  + s_embedding_size
12381
13453
  + s_embedding
13454
+ + s_kv_buf_size
13455
+ + s_kv_head
12382
13456
  + s_kv_size
12383
- + s_kv_ntok
13457
+ + s_kv_used
12384
13458
  + s_kv
13459
+ + s_kv_cells
12385
13460
  );
12386
13461
 
12387
13462
  return s_total;
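
llama_get_state_size now accounts for the logits/embeddings buffers and the per-cell KV metadata, but the save/restore round trip looks the same from the caller's side. A hedged sketch (helper names illustrative, `ctx` assumed valid):

    #include <cstdint>
    #include <vector>
    #include "llama.h"

    std::vector<uint8_t> save_state(llama_context * ctx) {
        std::vector<uint8_t> buf(llama_get_state_size(ctx));  // upper bound
        const size_t written = llama_copy_state_data(ctx, buf.data());
        buf.resize(written);                                   // keep only the bytes actually written
        return buf;
    }

    void load_state(llama_context * ctx, const std::vector<uint8_t> & buf) {
        llama_set_state_data(ctx, buf.data());
    }
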
@@ -12457,23 +13532,23 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12457
13532
 
12458
13533
  // copy logits
12459
13534
  {
12460
- const size_t logits_size = ctx->logits.size();
13535
+ const size_t logits_size = ctx->logits_size;
12461
13536
 
12462
13537
  data_ctx->write(&logits_size, sizeof(logits_size));
12463
13538
 
12464
13539
  if (logits_size) {
12465
- data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
13540
+ data_ctx->write(ctx->logits, logits_size * sizeof(float));
12466
13541
  }
12467
13542
  }
12468
13543
 
12469
13544
  // copy embeddings
12470
13545
  {
12471
- const size_t embedding_size = ctx->embedding.size();
13546
+ const size_t embeddings_size = ctx->embd_size;
12472
13547
 
12473
- data_ctx->write(&embedding_size, sizeof(embedding_size));
13548
+ data_ctx->write(&embeddings_size, sizeof(embeddings_size));
12474
13549
 
12475
- if (embedding_size) {
12476
- data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
13550
+ if (embeddings_size) {
13551
+ data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
12477
13552
  }
12478
13553
  }
12479
13554
 
@@ -12481,15 +13556,13 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12481
13556
  {
12482
13557
  const auto & kv_self = ctx->kv_self;
12483
13558
  const auto & hparams = ctx->model.hparams;
12484
- const auto & cparams = ctx->cparams;
12485
13559
 
12486
13560
  const uint32_t n_layer = hparams.n_layer;
12487
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
12488
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
12489
- const uint32_t n_ctx = cparams.n_ctx;
13561
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
13562
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
12490
13563
 
12491
13564
  const size_t kv_buf_size = kv_self.total_size();
12492
- const uint32_t kv_head = kv_self.head;
13565
+ const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
12493
13566
  const uint32_t kv_size = kv_self.size;
12494
13567
  const uint32_t kv_used = kv_self.used;
12495
13568
 
@@ -12507,9 +13580,20 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12507
13580
  ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
12508
13581
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
12509
13582
 
13583
+ if (kv_self.recurrent) {
13584
+ // v is contiguous for recurrent models
13585
+ // TODO: use other tensors for state models than k and v
13586
+ const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
13587
+
13588
+ tmp_buf.resize(v_size);
13589
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), 0, tmp_buf.size());
13590
+ data_ctx->write(tmp_buf.data(), tmp_buf.size());
13591
+ continue;
13592
+ }
13593
+
12510
13594
  // v is not contiguous, copy row by row
12511
13595
  const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12512
- const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
13596
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
12513
13597
 
12514
13598
  tmp_buf.resize(v_row_size);
12515
13599
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
@@ -12519,7 +13603,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12519
13603
  }
12520
13604
  }
12521
13605
 
12522
- for (uint32_t i = 0; i < kv_size; ++i) {
13606
+ for (uint32_t i = 0; i < kv_head; ++i) {
12523
13607
  const auto & cell = kv_self.cells[i];
12524
13608
 
12525
13609
  const llama_pos pos = cell.pos;
@@ -12567,27 +13651,25 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
12567
13651
 
12568
13652
  memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
12569
13653
 
12570
- GGML_ASSERT(ctx->logits.capacity() >= logits_size);
13654
+ GGML_ASSERT(ctx->logits_size >= logits_size);
12571
13655
 
12572
13656
  if (logits_size) {
12573
- ctx->logits.resize(logits_size);
12574
-
12575
- memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
13657
+ memcpy(ctx->logits, inp, logits_size * sizeof(float));
12576
13658
  inp += logits_size * sizeof(float);
12577
13659
  }
12578
13660
  }
12579
13661
 
12580
13662
  // set embeddings
12581
13663
  {
12582
- size_t embedding_size;
13664
+ size_t embeddings_size;
12583
13665
 
12584
- memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
13666
+ memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
12585
13667
 
12586
- GGML_ASSERT(ctx->embedding.capacity() == embedding_size);
13668
+ GGML_ASSERT(ctx->embd_size == embeddings_size);
12587
13669
 
12588
- if (embedding_size) {
12589
- memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
12590
- inp += embedding_size * sizeof(float);
13670
+ if (embeddings_size) {
13671
+ memcpy(ctx->embd, inp, embeddings_size * sizeof(float));
13672
+ inp += embeddings_size * sizeof(float);
12591
13673
  }
12592
13674
  }
12593
13675
 
@@ -12595,12 +13677,10 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
12595
13677
  {
12596
13678
  const auto & kv_self = ctx->kv_self;
12597
13679
  const auto & hparams = ctx->model.hparams;
12598
- const auto & cparams = ctx->cparams;
12599
13680
 
12600
13681
  const uint32_t n_layer = hparams.n_layer;
12601
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
12602
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
12603
- const uint32_t n_ctx = cparams.n_ctx;
13682
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
13683
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
12604
13684
 
12605
13685
  size_t kv_buf_size;
12606
13686
  uint32_t kv_head;
@@ -12621,9 +13701,19 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
12621
13701
  ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
12622
13702
  inp += k_size;
12623
13703
 
13704
+ if (kv_self.recurrent) {
13705
+ // v is contiguous for recurrent models
13706
+ // TODO: use other tensors for state models than k and v
13707
+ const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
13708
+
13709
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, 0, v_size);
13710
+ inp += v_size;
13711
+ continue;
13712
+ }
13713
+
12624
13714
  // v is not contiguous, copy row by row
12625
13715
  const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12626
- const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
13716
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
12627
13717
 
12628
13718
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
12629
13719
  ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
@@ -12632,13 +13722,15 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
12632
13722
  }
12633
13723
  }
12634
13724
 
13725
+ GGML_ASSERT(kv_self.size == kv_size);
13726
+
12635
13727
  ctx->kv_self.head = kv_head;
12636
13728
  ctx->kv_self.size = kv_size;
12637
13729
  ctx->kv_self.used = kv_used;
12638
13730
 
12639
13731
  ctx->kv_self.cells.resize(kv_size);
12640
13732
 
12641
- for (uint32_t i = 0; i < kv_size; ++i) {
13733
+ for (uint32_t i = 0; i < kv_head; ++i) {
12642
13734
  llama_pos pos;
12643
13735
  size_t seq_id_size;
12644
13736
 
@@ -12654,6 +13746,11 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
12654
13746
  ctx->kv_self.cells[i].seq_id.insert(seq_id);
12655
13747
  }
12656
13748
  }
13749
+
13750
+ for (uint32_t i = kv_head; i < kv_size; ++i) {
13751
+ ctx->kv_self.cells[i].pos = -1;
13752
+ ctx->kv_self.cells[i].seq_id.clear();
13753
+ }
12657
13754
  }
12658
13755
 
12659
13756
  const size_t nread = inp - src;
@@ -12751,6 +13848,15 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
12751
13848
  ctx->cparams.n_threads_batch = n_threads_batch;
12752
13849
  }
12753
13850
 
13851
+ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
13852
+ ctx->abort_callback = abort_callback;
13853
+ ctx->abort_callback_data = abort_callback_data;
13854
+ }
13855
+
13856
+ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
13857
+ ctx->cparams.causal_attn = causal_attn;
13858
+ }
13859
+
12754
13860
  struct llama_batch llama_batch_get_one(
12755
13861
  llama_token * tokens,
12756
13862
  int32_t n_tokens,
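
The two new setters above let a caller cancel a long-running llama_decode from another thread and toggle causal attention at runtime. A hedged sketch using an atomic flag (the flag and helper names are illustrative):

    #include <atomic>
    #include "llama.h"

    static std::atomic<bool> g_stop{false};  // illustrative cancellation flag

    // return true to abort the current graph computation
    static bool should_abort(void * /*data*/) {
        return g_stop.load();
    }

    void install_abort(llama_context * ctx) {
        llama_set_abort_callback(ctx, should_abort, nullptr);
        // non-causal attention can also be toggled, e.g. for embedding-style passes:
        // llama_set_causal_attn(ctx, false);
    }
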
@@ -12817,32 +13923,81 @@ int32_t llama_decode(
12817
13923
  return ret;
12818
13924
  }
12819
13925
 
13926
+ void llama_synchronize(struct llama_context * ctx) {
13927
+ ggml_backend_sched_synchronize(ctx->sched);
13928
+
13929
+ // FIXME: if multiple single tokens are evaluated without a synchronization,
13930
+ // the stats will be added to the prompt evaluation stats
13931
+ // this should only happen when using batch size 1 to evaluate a batch
13932
+
13933
+ // add the evaluation to the stats
13934
+ if (ctx->n_queued_tokens == 1) {
13935
+ ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
13936
+ ctx->n_eval++;
13937
+ } else if (ctx->n_queued_tokens > 1) {
13938
+ ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
13939
+ ctx->n_p_eval += ctx->n_queued_tokens;
13940
+ }
13941
+
13942
+ // get a more accurate load time, upon first eval
13943
+ if (ctx->n_queued_tokens > 0 && !ctx->has_evaluated_once) {
13944
+ ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
13945
+ ctx->has_evaluated_once = true;
13946
+ }
13947
+
13948
+ ctx->n_queued_tokens = 0;
13949
+ ctx->t_compute_start_us = 0;
13950
+ }
13951
+
12820
13952
  float * llama_get_logits(struct llama_context * ctx) {
12821
- return ctx->logits.data();
13953
+ llama_synchronize(ctx);
13954
+
13955
+ return ctx->logits;
12822
13956
  }
12823
13957
 
12824
13958
  float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
12825
13959
  assert(ctx->logits_valid.at(i));
12826
- return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
13960
+
13961
+ llama_synchronize(ctx);
13962
+
13963
+ return ctx->logits + i*ctx->model.hparams.n_vocab;
12827
13964
  }
12828
13965
 
12829
13966
  float * llama_get_embeddings(struct llama_context * ctx) {
12830
- return ctx->embedding.data();
13967
+ llama_synchronize(ctx);
13968
+
13969
+ return ctx->embd;
12831
13970
  }
12832
13971
 
12833
13972
  float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
12834
- return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
13973
+ llama_synchronize(ctx);
13974
+
13975
+ return ctx->embd + i*ctx->model.hparams.n_embd;
13976
+ }
13977
+
13978
+ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
13979
+ llama_synchronize(ctx);
13980
+
13981
+ auto it = ctx->embd_seq.find(seq_id);
13982
+ if (it == ctx->embd_seq.end()) {
13983
+ return nullptr;
13984
+ }
13985
+
13986
+ return it->second.data();
12835
13987
  }
12836
13988
 
12837
13989
  const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
13990
+ GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
12838
13991
  return model->vocab.id_to_token[token].text.c_str();
12839
13992
  }
12840
13993
 
12841
13994
  float llama_token_get_score(const struct llama_model * model, llama_token token) {
13995
+ GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
12842
13996
  return model->vocab.id_to_token[token].score;
12843
13997
  }
12844
13998
 
12845
13999
  llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
14000
+ GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
12846
14001
  return model->vocab.id_to_token[token].type;
12847
14002
  }
12848
14003
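
Output getters now call llama_synchronize first, and a new per-sequence getter returns pooled embeddings. A usage sketch after llama_decode of a batch whose sequence requested embeddings (helper name illustrative; `ctx` and `model` assumed valid):

    #include <cstdio>
    #include "llama.h"

    void print_seq_embedding(llama_context * ctx, const llama_model * model, llama_seq_id seq) {
        const float * embd = llama_get_embeddings_seq(ctx, seq);  // waits for pending compute
        if (embd == nullptr) {
            return;  // no pooled embedding was produced for this sequence
        }
        printf("first component: %f (dim = %d)\n", embd[0], llama_n_embd(model));
    }
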
 
@@ -12887,12 +14042,12 @@ int32_t llama_tokenize(
12887
14042
  const char * text,
12888
14043
  int32_t text_len,
12889
14044
  llama_token * tokens,
12890
- int32_t n_max_tokens,
14045
+ int32_t n_tokens_max,
12891
14046
  bool add_bos,
12892
14047
  bool special) {
12893
14048
  auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
12894
14049
 
12895
- if (n_max_tokens < (int) res.size()) {
14050
+ if (n_tokens_max < (int) res.size()) {
12896
14051
  // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
12897
14052
  return -((int) res.size());
12898
14053
  }
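
llama_tokenize keeps its negative-return convention under the renamed n_tokens_max parameter: a negative result is minus the required token count. A hedged resize-and-retry sketch (helper name illustrative):

    #include <string>
    #include <vector>
    #include "llama.h"

    std::vector<llama_token> tokenize(const llama_model * model, const std::string & text) {
        std::vector<llama_token> tokens(8);
        int32_t n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                                   tokens.data(), (int32_t) tokens.size(),
                                   /*add_bos=*/true, /*special=*/false);
        if (n < 0) {
            tokens.resize(-n);  // -n is the number of tokens actually needed
            n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                               tokens.data(), (int32_t) tokens.size(), true, false);
        }
        tokens.resize(n);
        return tokens;
    }
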
@@ -12906,9 +14061,9 @@ int32_t llama_tokenize(
12906
14061
 
12907
14062
  static std::string llama_decode_text(const std::string & text) {
12908
14063
  std::string decoded_text;
12909
- auto unicode_sequences = codepoints_from_utf8(text);
12910
- for (auto& unicode_sequence : unicode_sequences) {
12911
- decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence));
14064
+ auto unicode_sequences = unicode_cpts_from_utf8(text);
14065
+ for (auto & unicode_sequence : unicode_sequences) {
14066
+ decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));
12912
14067
  }
12913
14068
 
12914
14069
  return decoded_text;
@@ -12933,7 +14088,7 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
12933
14088
  } else if (llama_is_user_defined_token(model->vocab, token)) {
12934
14089
  std::string result = model->vocab.id_to_token[token].text;
12935
14090
  if (length < (int) result.length()) {
12936
- return -result.length();
14091
+ return -(int) result.length();
12937
14092
  }
12938
14093
  memcpy(buf, result.c_str(), result.length());
12939
14094
  return result.length();
@@ -12968,7 +14123,7 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
12968
14123
  } else if (llama_is_user_defined_token(model->vocab, token)) {
12969
14124
  std::string result = model->vocab.id_to_token[token].text;
12970
14125
  if (length < (int) result.length()) {
12971
- return -result.length();
14126
+ return -(int) result.length();
12972
14127
  }
12973
14128
  memcpy(buf, result.c_str(), result.length());
12974
14129
  return result.length();
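
llama_token_to_piece follows the same convention; the change above simply makes the negative length a properly signed int. A small sketch converting one token to text (helper name illustrative):

    #include <string>
    #include "llama.h"

    std::string token_to_string(const llama_model * model, llama_token token) {
        std::string piece(8, '\0');
        int32_t n = llama_token_to_piece(model, token, piece.data(), (int32_t) piece.size());
        if (n < 0) {
            piece.resize(-n);  // -n is the required buffer length
            n = llama_token_to_piece(model, token, piece.data(), (int32_t) piece.size());
        }
        piece.resize(n);
        return piece;
    }
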
@@ -13005,7 +14160,7 @@ static int32_t llama_chat_apply_template_internal(
13005
14160
  std::string & dest, bool add_ass) {
13006
14161
  // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
13007
14162
  std::stringstream ss;
13008
- if (tmpl.find("<|im_start|>") != std::string::npos) {
14163
+ if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) {
13009
14164
  // chatml template
13010
14165
  for (auto message : chat) {
13011
14166
  ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@@ -13013,7 +14168,7 @@ static int32_t llama_chat_apply_template_internal(
13013
14168
  if (add_ass) {
13014
14169
  ss << "<|im_start|>assistant\n";
13015
14170
  }
13016
- } else if (tmpl.find("[INST]") != std::string::npos) {
14171
+ } else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) {
13017
14172
  // llama2 template and its variants
13018
14173
  // [variant] support system message
13019
14174
  bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
@@ -13048,7 +14203,7 @@ static int32_t llama_chat_apply_template_internal(
13048
14203
  }
13049
14204
  }
13050
14205
  // llama2 templates seem to not care about "add_generation_prompt"
13051
- } else if (tmpl.find("<|user|>") != std::string::npos) {
14206
+ } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
13052
14207
  // zephyr template
13053
14208
  for (auto message : chat) {
13054
14209
  ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@@ -13056,7 +14211,7 @@ static int32_t llama_chat_apply_template_internal(
13056
14211
  if (add_ass) {
13057
14212
  ss << "<|assistant|>\n";
13058
14213
  }
13059
- } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
14214
+ } else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) {
13060
14215
  // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
13061
14216
  for (auto message : chat) {
13062
14217
  std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@@ -13065,7 +14220,7 @@ static int32_t llama_chat_apply_template_internal(
13065
14220
  if (add_ass) {
13066
14221
  ss << "<s>assistant\n";
13067
14222
  }
13068
- } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
14223
+ } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
13069
14224
  // google/gemma-7b-it
13070
14225
  std::string system_prompt = "";
13071
14226
  for (auto message : chat) {
@@ -13087,6 +14242,26 @@ static int32_t llama_chat_apply_template_internal(
13087
14242
  if (add_ass) {
13088
14243
  ss << "<start_of_turn>model\n";
13089
14244
  }
14245
+ } else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) {
14246
+ // OrionStarAI/Orion-14B-Chat
14247
+ std::string system_prompt = "";
14248
+ for (auto message : chat) {
14249
+ std::string role(message->role);
14250
+ if (role == "system") {
14251
+ // there is no system message support, we will merge it with user prompt
14252
+ system_prompt = message->content;
14253
+ continue;
14254
+ } else if (role == "user") {
14255
+ ss << "Human: ";
14256
+ if (!system_prompt.empty()) {
14257
+ ss << system_prompt << "\n\n";
14258
+ system_prompt = "";
14259
+ }
14260
+ ss << message->content << "\n\nAssistant: </s>";
14261
+ } else {
14262
+ ss << message->content << "</s>";
14263
+ }
14264
+ }
13090
14265
  } else {
13091
14266
  // template not supported
13092
14267
  return -1;
@@ -13112,23 +14287,27 @@ LLAMA_API int32_t llama_chat_apply_template(
13112
14287
  int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
13113
14288
  if (res < 0) {
13114
14289
  // worst case: there is no information about template, we will use chatml by default
13115
- curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
14290
+ curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
13116
14291
  } else {
13117
14292
  curr_tmpl = std::string(model_template.data(), model_template.size());
13118
14293
  }
13119
14294
  }
14295
+
13120
14296
  // format the chat to string
13121
14297
  std::vector<const llama_chat_message *> chat_vec;
13122
14298
  chat_vec.resize(n_msg);
13123
14299
  for (size_t i = 0; i < n_msg; i++) {
13124
14300
  chat_vec[i] = &chat[i];
13125
14301
  }
14302
+
13126
14303
  std::string formatted_chat;
13127
14304
  int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
13128
14305
  if (res < 0) {
13129
14306
  return res;
13130
14307
  }
13131
- strncpy(buf, formatted_chat.c_str(), length);
14308
+ if (buf && length > 0) {
14309
+ strncpy(buf, formatted_chat.c_str(), length);
14310
+ }
13132
14311
  return res;
13133
14312
  }
13134
14313
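
With the changes above, llama_chat_apply_template accepts well-known template names ("chatml", "llama2", "zephyr", "monarch", "gemma", "orion") in addition to sniffing the Jinja source, and it now tolerates a null/zero output buffer, which makes the usual measure-then-fill pattern safe. A hedged sketch (helper name illustrative, `model` assumed valid):

    #include <string>
    #include <vector>
    #include "llama.h"

    std::string render_chat(const llama_model * model) {
        std::vector<llama_chat_message> chat = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!" },
        };
        // first call with a null buffer only measures the required length
        const int32_t n = llama_chat_apply_template(model, "chatml", chat.data(), chat.size(),
                                                    /*add_ass=*/true, nullptr, 0);
        if (n < 0) {
            return {};  // template not recognized
        }
        std::string out(n, '\0');
        llama_chat_apply_template(model, "chatml", chat.data(), chat.size(), true, out.data(), n);
        return out;
    }
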