llama_cpp 0.12.7 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -0
- data/ext/llama_cpp/llama_cpp.cpp +131 -288
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +29 -29
- data/vendor/tmp/llama.cpp/Makefile +10 -6
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +32 -23
- data/vendor/tmp/llama.cpp/ggml-backend.h +17 -16
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +949 -168
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +159 -22
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1195 -139
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +27 -27
- data/vendor/tmp/llama.cpp/ggml-quants.c +1971 -271
- data/vendor/tmp/llama.cpp/ggml-quants.h +52 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3586 -1201
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1391 -825
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +545 -210
- data/vendor/tmp/llama.cpp/ggml.h +65 -23
- data/vendor/tmp/llama.cpp/llama.cpp +1458 -763
- data/vendor/tmp/llama.cpp/llama.h +81 -75
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
@@ -68,10 +68,12 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
+#include <cwctype>
 #include <forward_list>
 #include <fstream>
 #include <functional>
 #include <initializer_list>
+#include <locale>
 #include <map>
 #include <memory>
 #include <mutex>
@@ -102,6 +104,7 @@
 #define LLAMA_MAX_NODES 8192
 #define LLAMA_MAX_EXPERTS 8
 
+
 //
 // logging
 //
@@ -209,10 +212,11 @@ enum llm_arch {
 LLM_ARCH_INTERNLM2,
 LLM_ARCH_MINICPM,
 LLM_ARCH_GEMMA,
+LLM_ARCH_STARCODER2,
 LLM_ARCH_UNKNOWN,
 };
 
-static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_LLAMA, "llama" },
 { LLM_ARCH_FALCON, "falcon" },
 { LLM_ARCH_GPT2, "gpt2" },
@@ -236,6 +240,8 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_INTERNLM2, "internlm2" },
 { LLM_ARCH_MINICPM, "minicpm" },
 { LLM_ARCH_GEMMA, "gemma" },
+{ LLM_ARCH_STARCODER2, "starcoder2" },
+{ LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
 enum llm_kv {
@@ -296,7 +302,7 @@ enum llm_kv {
 LLM_KV_TOKENIZER_RWKV,
 };
 
-static std::map<llm_kv, const char *> LLM_KV_NAMES = {
+static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
 { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
 { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
@@ -360,7 +366,7 @@ struct LLM_KV {
 llm_arch arch;
 
 std::string operator()(llm_kv kv) const {
-return ::format(LLM_KV_NAMES
+return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
 }
 };
 
@@ -395,7 +401,7 @@ enum llm_tensor {
 LLM_TENSOR_LAYER_OUT_NORM,
 };
 
-static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
+static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
 {
 LLM_ARCH_LLAMA,
 {
@@ -777,6 +783,24 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 },
 },
+{
+LLM_ARCH_STARCODER2,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+},
+},
 {
 LLM_ARCH_UNKNOWN,
 {
@@ -810,38 +834,38 @@ struct LLM_TN {
 llm_arch arch;
 
 std::string operator()(llm_tensor tensor) const {
-if (LLM_TENSOR_NAMES
+if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
 return "__missing__";
 }
-return LLM_TENSOR_NAMES
+return LLM_TENSOR_NAMES.at(arch).at(tensor);
 }
 
 std::string operator()(llm_tensor tensor, const std::string & suffix) const {
-if (LLM_TENSOR_NAMES
+if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
 return "__missing__";
 }
-return LLM_TENSOR_NAMES
+return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix;
 }
 
 std::string operator()(llm_tensor tensor, int bid) const {
-if (LLM_TENSOR_NAMES
+if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
 return "__missing__";
 }
-return ::format(LLM_TENSOR_NAMES
+return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid);
 }
 
 std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
-if (LLM_TENSOR_NAMES
+if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
 return "__missing__";
 }
-return ::format(LLM_TENSOR_NAMES
+return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix;
 }
 
 std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
-if (LLM_TENSOR_NAMES
+if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
 return "__missing__";
 }
-return ::format(LLM_TENSOR_NAMES
+return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
 }
 };
 
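Note on the hunks above: the name tables become `static const std::map` and the accessors switch from `operator[]` to `.at()`. A minimal standalone sketch (not part of the diff) of why that switch is required on a const map:

```cpp
#include <cstdio>
#include <map>
#include <string>

static const std::map<int, std::string> NAMES = {
    {0, "llama"},
    {1, "falcon"},
};

int main() {
    // NAMES[0] would not compile here: std::map::operator[] is non-const because
    // it inserts a default-constructed value when the key is missing.
    std::printf("%s\n", NAMES.at(0).c_str());

    // .at() throws std::out_of_range for a missing key instead of silently
    // growing the table, so callers that tolerate misses guard with find() first.
    if (NAMES.find(7) == NAMES.end()) {
        std::printf("__missing__\n");
    }
    return 0;
}
```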
@@ -849,20 +873,20 @@ struct LLM_TN {
 // gguf helpers
 //
 
-static std::map<
-{
-{
-{
+static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
+{ LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
+{ LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
+{ LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
 };
 
-static
+static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
 for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
 if (kv.second == name) {
-return kv.first;
+return (llama_rope_scaling_type) kv.first;
 }
 }
 
-return
+return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
 }
 
 static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
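The hunk above adds a name table for the rope-scaling types plus a reverse lookup that linearly scans it. A minimal sketch of the same string-to-enum pattern, using illustrative names rather than the library's:

```cpp
#include <cstdio>
#include <map>
#include <string>

enum scaling_type {
    SCALING_UNSPECIFIED = -1,
    SCALING_NONE        = 0,
    SCALING_LINEAR      = 1,
    SCALING_YARN        = 2,
};

static const std::map<scaling_type, const char *> SCALING_TYPES = {
    {SCALING_NONE,   "none"},
    {SCALING_LINEAR, "linear"},
    {SCALING_YARN,   "yarn"},
};

static scaling_type scaling_type_from_string(const std::string & name) {
    for (const auto & kv : SCALING_TYPES) {
        if (kv.second == name) {          // const char * compared against std::string
            return kv.first;
        }
    }
    return SCALING_UNSPECIFIED;           // sentinel for names not in the table
}

int main() {
    std::printf("%d\n", scaling_type_from_string("yarn"));   // prints 2
    return 0;
}
```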
@@ -1407,7 +1431,9 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
 buft = ggml_backend_cuda_host_buffer_type();
 }
 #elif defined(GGML_USE_SYCL)
-
+if (host_buffer) {
+buft = ggml_backend_sycl_host_buffer_type();
+}
 #elif defined(GGML_USE_CPU_HBM)
 buft = ggml_backend_cpu_hbm_buffer_type();
 #elif defined(GGML_USE_VULKAN)
@@ -1461,6 +1487,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 }
 #endif
 
+#ifdef GGML_USE_SYCL
+if (ggml_backend_sycl_get_device_count() > 1) {
+buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+}
+#endif
+
 if (buft == nullptr) {
 buft = llama_default_buffer_type_offload(fallback_gpu);
 }
@@ -1472,6 +1504,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 static size_t llama_get_device_count() {
 #if defined(GGML_USE_CUBLAS)
 return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+return ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
 return ggml_backend_vk_get_device_count();
 #else
@@ -1485,6 +1519,11 @@ static size_t llama_get_device_memory(int device) {
 size_t free;
 ggml_backend_cuda_get_device_memory(device, &total, &free);
 return free;
+#elif defined(GGML_USE_SYCL)
+size_t total;
+size_t free;
+ggml_backend_sycl_get_device_memory(device, &total, &free);
+return free;
 #elif defined(GGML_USE_VULKAN)
 size_t total;
 size_t free;
@@ -1550,8 +1589,9 @@ static const size_t MiB = 1024*kiB;
 static const size_t GiB = 1024*MiB;
 
 struct llama_hparams {
-bool
-bool
+bool vocab_only;
+bool rope_finetuned;
+
 uint32_t n_vocab;
 uint32_t n_ctx_train; // context size the model was trained on
 uint32_t n_embd;
@@ -1572,7 +1612,6 @@ struct llama_hparams {
 float rope_freq_base_train;
 float rope_freq_scale_train;
 uint32_t n_yarn_orig_ctx;
-int32_t rope_scaling_type_train;
 
 float f_clamp_kqv = 0.0f;
 float f_max_alibi_bias = 0.0f;
@@ -1580,7 +1619,9 @@
 bool causal_attn = true;
 bool need_kq_pos = false;
 
-
+enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
+enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
+enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
 
 bool operator!=(const llama_hparams & other) const {
 if (this->vocab_only != other.vocab_only) return true;
@@ -1624,13 +1665,13 @@
 };
 
 struct llama_cparams {
-uint32_t n_ctx;
+uint32_t n_ctx; // context size used during inference
 uint32_t n_batch;
 uint32_t n_threads; // number of threads to use for generation
 uint32_t n_threads_batch; // number of threads to use for batch processing
 
-float
-float
+float rope_freq_base;
+float rope_freq_scale;
 
 uint32_t n_yarn_orig_ctx;
 // These hyperparameters are not exposed in GGUF, because all
@@ -1639,10 +1680,12 @@
 float yarn_attn_factor;
 float yarn_beta_fast;
 float yarn_beta_slow;
+float defrag_thold;
 
-bool
+bool embeddings;
 bool offload_kqv;
-
+
+enum llama_pooling_type pooling_type;
 
 ggml_backend_sched_eval_callback cb_eval;
 void * cb_eval_user_data;
@@ -1707,11 +1750,20 @@ struct llama_kv_cell {
 bool has_seq_id(const llama_seq_id & id) const {
 return seq_id.find(id) != seq_id.end();
 }
+
+bool is_empty() const {
+return seq_id.empty();
+}
+
+bool is_same_seq(const llama_kv_cell & other) const {
+return seq_id == other.seq_id;
+}
 };
 
 // ring-buffer of cached KV data
 struct llama_kv_cache {
 bool has_shift = false;
+bool do_defrag = false;
 
 // Note: The value of head isn't only used to optimize searching
 // for a free KV slot. llama_decode_internal also uses it, so it
@@ -1723,6 +1775,9 @@ struct llama_kv_cache {
 // computed before each graph build
 uint32_t n = 0;
 
+ggml_type type_k = GGML_TYPE_F16;
+ggml_type type_v = GGML_TYPE_F16;
+
 std::vector<llama_kv_cell> cells;
 
 std::vector<struct ggml_tensor *> k_l; // per layer
@@ -1919,7 +1974,7 @@ struct llama_context {
 int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
 int32_t n_eval = 0; // number of eval calls
 
-//
+// logits output (2-dimensional array: [n_tokens][n_vocab])
 std::vector<float> logits;
 #ifndef NDEBUG
 // guard against access to unset logits
@@ -1927,13 +1982,21 @@
 #endif
 bool logits_all = false;
 
-//
-
+// embeddings output (2-dimensional array: [n_tokens][n_embd])
+// populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
+std::vector<float> embd;
+
+// sequence embeddings output (map of [n_embd] vectors)
+// populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
+std::map<llama_seq_id, std::vector<float>> embd_seq;
 
 // memory buffers used to evaluate the model
 std::vector<uint8_t> buf_compute_meta;
 ggml_backend_sched_t sched = nullptr;
 
+ggml_abort_callback abort_callback = nullptr;
+void * abort_callback_data = nullptr;
+
 // input tensors
 ggml_backend_buffer_t buf_input = nullptr;
 ggml_context * ctx_input = nullptr;
@@ -1958,8 +2021,8 @@
 static bool llama_kv_cache_init(
 struct llama_kv_cache & cache,
 const llama_model & model,
-ggml_type
-ggml_type
+ggml_type type_k,
+ggml_type type_v,
 uint32_t n_ctx,
 bool offload) {
 const struct llama_hparams & hparams = model.hparams;
@@ -1974,6 +2037,9 @@ static bool llama_kv_cache_init(
 cache.size = n_ctx;
 cache.used = 0;
 
+cache.type_k = type_k;
+cache.type_v = type_v;
+
 cache.cells.clear();
 cache.cells.resize(n_ctx);
 
@@ -2014,8 +2080,8 @@
 
 for (int i = 0; i < (int) n_layer; i++) {
 struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
-ggml_tensor * k = ggml_new_tensor_1d(ctx,
-ggml_tensor * v = ggml_new_tensor_1d(ctx,
+ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*n_ctx);
+ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*n_ctx);
 ggml_format_name(k, "cache_k_l%d", i);
 ggml_format_name(v, "cache_v_l%d", i);
 cache.k_l.push_back(k);
@@ -2097,10 +2163,12 @@ static bool llama_kv_cache_find_slot(
 }
 
 // find how many cells are currently in use
-static
-for (uint32_t i = cache.size
-
-
+static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
+for (uint32_t i = cache.size; i > 0; --i) {
+const llama_kv_cell & cell = cache.cells[i - 1];
+
+if (cell.pos >= 0 && !cell.is_empty()) {
+return i;
 }
 }
 
@@ -2135,7 +2203,7 @@ static void llama_kv_cache_seq_rm(
 } else {
 continue;
 }
-if (cache.cells[i].
+if (cache.cells[i].is_empty()) {
 // keep count of the number of used cells
 if (cache.cells[i].pos >= 0) cache.used--;
 
@@ -2186,7 +2254,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
 if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
 }
 
-static void
+static void llama_kv_cache_seq_add(
 struct llama_kv_cache & cache,
 llama_seq_id seq_id,
 llama_pos p0,
@@ -2204,10 +2272,14 @@ static void llama_kv_cache_seq_shift(
 cache.cells[i].delta += delta;
 
 if (cache.cells[i].pos < 0) {
-if (!cache.cells[i].
+if (!cache.cells[i].is_empty()) {
+cache.used--;
+}
 cache.cells[i].pos = -1;
 cache.cells[i].seq_id.clear();
-if (new_head == cache.size)
+if (new_head == cache.size) {
+new_head = i;
+}
 }
 }
 }
@@ -2239,6 +2311,22 @@ static void llama_kv_cache_seq_div(
 }
 }
 
+static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+llama_pos result = 0;
+
+for (uint32_t i = 0; i < cache.size; ++i) {
+if (cache.cells[i].has_seq_id(seq_id)) {
+result = std::max(result, cache.cells[i].pos);
+}
+}
+
+return result;
+}
+
+static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
+cache.do_defrag = true;
+}
+
 //
 // model loading and saving
 //
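The cache changes above add `is_empty()`/`is_same_seq()` helpers, a reverse scan for the highest used cell, a per-sequence position maximum, and a `do_defrag` flag that merely schedules defragmentation for later. A simplified, self-contained sketch of the scanning helpers (the cell struct is a stand-in, not the real `llama_kv_cell`):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <set>
#include <vector>

// stripped-down stand-in for llama_kv_cell
struct kv_cell {
    int32_t pos = -1;
    std::set<int32_t> seq_id;

    bool is_empty() const { return seq_id.empty(); }
    bool has_seq_id(int32_t id) const { return seq_id.count(id) > 0; }
};

// index one past the last cell that holds a position and belongs to a sequence
static uint32_t cell_max(const std::vector<kv_cell> & cells) {
    for (uint32_t i = (uint32_t) cells.size(); i > 0; --i) {
        const kv_cell & cell = cells[i - 1];
        if (cell.pos >= 0 && !cell.is_empty()) {
            return i;
        }
    }
    return 0;
}

// largest position stored for a sequence (0 when the sequence is absent)
static int32_t seq_pos_max(const std::vector<kv_cell> & cells, int32_t seq_id) {
    int32_t result = 0;
    for (const kv_cell & cell : cells) {
        if (cell.has_seq_id(seq_id)) {
            result = std::max(result, cell.pos);
        }
    }
    return result;
}

int main() {
    std::vector<kv_cell> cells(8);
    cells[0].pos = 0; cells[0].seq_id.insert(1);
    cells[1].pos = 1; cells[1].seq_id.insert(1);
    std::printf("cell_max = %u, max pos of seq 1 = %d\n", cell_max(cells), seq_pos_max(cells, 1));
    return 0;
}
```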
@@ -2310,7 +2398,7 @@ namespace GGUFMeta {
 }
 };
 
-struct ArrayInfo{
+struct ArrayInfo {
 const gguf_type gt;
 const size_t length;
 const void * data;
@@ -2329,7 +2417,7 @@ namespace GGUFMeta {
 };
 
 template<typename T>
-class GKV: public GKV_Base<T> {
+class GKV : public GKV_Base<T> {
 GKV() = delete;
 
 public:
@@ -2345,46 +2433,46 @@ namespace GGUFMeta {
 
 static const char * override_type_to_str(const llama_model_kv_override_type ty) {
 switch (ty) {
-case
-case
-case
+case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
+case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
+case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
 }
 return "unknown";
 }
 
-static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *
-if (!
-if (
+static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) {
+if (!ovrd) { return false; }
+if (ovrd->tag == expected_type) {
 LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
-__func__, override_type_to_str(
-switch (
-case
-LLAMA_LOG_INFO("%s\n",
+__func__, override_type_to_str(ovrd->tag), ovrd->key);
+switch (ovrd->tag) {
+case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
+LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
 } break;
-case
-LLAMA_LOG_INFO("%" PRId64 "\n",
+case LLAMA_KV_OVERRIDE_TYPE_INT: {
+LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
 } break;
-case
-LLAMA_LOG_INFO("%.6f\n",
+case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
+LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
 } break;
 default:
 // Shouldn't be possible to end up here, but just in case...
 throw std::runtime_error(
 format("Unsupported attempt to override %s type for metadata key %s\n",
-override_type_to_str(
+override_type_to_str(ovrd->tag), ovrd->key));
 }
 return true;
 }
 LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
-__func__,
+__func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag));
 return false;
 }
 
 template<typename OT>
 static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
-try_override(OT & target, const struct llama_model_kv_override *
-if (validate_override(
-target =
+try_override(OT & target, const struct llama_model_kv_override * ovrd) {
+if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
+target = ovrd->bool_value;
 return true;
 }
 return false;
@@ -2392,9 +2480,9 @@ namespace GGUFMeta {
 
 template<typename OT>
 static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
-try_override(OT & target, const struct llama_model_kv_override *
-if (validate_override(
-target =
+try_override(OT & target, const struct llama_model_kv_override * ovrd) {
+if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
+target = ovrd->int_value;
 return true;
 }
 return false;
@@ -2402,9 +2490,9 @@ namespace GGUFMeta {
 
 template<typename OT>
 static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
-try_override(T & target, const struct llama_model_kv_override *
-if (validate_override(
-target =
+try_override(T & target, const struct llama_model_kv_override * ovrd) {
+if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
+target = ovrd->float_value;
 return true;
 }
 return false;
@@ -2412,17 +2500,17 @@ namespace GGUFMeta {
 
 template<typename OT>
 static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
-try_override(T & target, const struct llama_model_kv_override *
+try_override(T & target, const struct llama_model_kv_override * ovrd) {
 (void)target;
-(void)
-if (!
+(void)ovrd;
+if (!ovrd) { return false; }
 // Currently, we should never end up here so it would be a bug if we do.
 throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
-
+ovrd ? ovrd->key : "NULL"));
 }
 
-static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *
-if (try_override<T>(target,
+static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
+if (try_override<T>(target, ovrd)) {
 return true;
 }
 if (k < 0) { return false; }
@@ -2430,12 +2518,12 @@ namespace GGUFMeta {
 return true;
 }
 
-static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *
-return set(ctx, gguf_find_key(ctx, key), target,
+static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
+return set(ctx, gguf_find_key(ctx, key), target, ovrd);
 }
 
-static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *
-return set(ctx, key.c_str(), target,
+static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
+return set(ctx, key.c_str(), target, ovrd);
 }
 };
 }
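The GGUFMeta hunks rename the override parameter to `ovrd` and the tags to `LLAMA_KV_OVERRIDE_TYPE_*`; the underlying pattern is a tag-checked union whose value is copied out only after validation. A generic sketch under illustrative names, not the library's:

```cpp
#include <cstdint>
#include <cstdio>

enum override_type { OVERRIDE_TYPE_BOOL, OVERRIDE_TYPE_INT, OVERRIDE_TYPE_FLOAT };

struct kv_override {
    override_type tag;
    const char *  key;
    union {
        bool    bool_value;
        int64_t int_value;
        double  float_value;
    };
};

static bool validate_override(override_type expected, const kv_override * ovrd) {
    if (!ovrd) { return false; }                    // no override supplied
    if (ovrd->tag != expected) {                    // caller asked for the wrong type
        std::fprintf(stderr, "bad override type for key '%s'\n", ovrd->key);
        return false;
    }
    return true;
}

static bool try_override_int(int64_t & target, const kv_override * ovrd) {
    if (validate_override(OVERRIDE_TYPE_INT, ovrd)) {
        target = ovrd->int_value;                   // apply only after the tag check
        return true;
    }
    return false;
}

int main() {
    kv_override ov;
    ov.tag = OVERRIDE_TYPE_INT;
    ov.key = "example.n_layer";
    ov.int_value = 40;

    int64_t n_layer = 0;
    if (try_override_int(n_layer, &ov)) {
        std::printf("%s overridden to %lld\n", ov.key, (long long) n_layer);
    }
    return 0;
}
```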
@@ -2542,9 +2630,12 @@ struct llama_model_loader {
 case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
 case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
 case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
+case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
 case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
 case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
 case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
+case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
+case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
 default:
 {
 LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2845,6 +2936,19 @@ struct llama_model_loader {
 }
 };
 
+template<>
+bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
+uint32_t tmp;
+const bool found = get_key(kid, tmp, required);
+if (found) {
+result = (enum llama_pooling_type) tmp;
+} else {
+result = LLAMA_POOLING_TYPE_UNSPECIFIED;
+}
+return found;
+}
+
+
 //
 // load LLaMA models
 //
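The specialization above reads the pooling type as a raw `uint32_t`, casts it to the enum, and falls back to an "unspecified" sentinel when the key is absent. A small sketch of that read-then-cast pattern with a stubbed metadata reader (hypothetical names, not the GGUF API):

```cpp
#include <cstdint>
#include <cstdio>

enum pooling_type { POOLING_UNSPECIFIED = -1, POOLING_NONE = 0, POOLING_MEAN = 1, POOLING_CLS = 2 };

// stub standing in for the real metadata lookup: returns true and fills *out
// when the key is present
static bool read_u32(const char * key, uint32_t * out) {
    (void) key;
    *out = POOLING_MEAN;
    return true;
}

static bool get_pooling_type(const char * key, pooling_type & result) {
    uint32_t tmp = 0;
    const bool found = read_u32(key, &tmp);
    // cast only when the key was present; otherwise leave the sentinel
    result = found ? (pooling_type) tmp : POOLING_UNSPECIFIED;
    return found;
}

int main() {
    pooling_type pt = POOLING_UNSPECIFIED;
    get_pooling_type("example.pooling_type", pt);
    std::printf("pooling_type = %d\n", pt);
    return 0;
}
```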
@@ -2886,10 +2990,15 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
 case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
-case
+case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
+case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
+case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
+case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
+case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
+case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
 
 default: return "unknown, may not work";
 }
@@ -2923,16 +3032,16 @@ static const char * llama_model_type_name(e_model type) {
 default: return "?B";
 }
 }
+
 static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
 switch (type) {
-case LLAMA_VOCAB_TYPE_SPM:
-case LLAMA_VOCAB_TYPE_BPE:
-case LLAMA_VOCAB_TYPE_WPM:
-default:
+case LLAMA_VOCAB_TYPE_SPM: return "SPM";
+case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+case LLAMA_VOCAB_TYPE_WPM: return "WPM";
+default: return "unknown";
 }
 }
 
-
 static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
 model.arch = ml.get_arch();
 if (model.arch == LLM_ARCH_UNKNOWN) {
@@ -2996,7 +3105,7 @@ static void llm_load_hparams(
 std::string rope_scaling("linear");
 ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
 hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
-GGML_ASSERT(hparams.rope_scaling_type_train !=
+GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
 
 // rope_freq_scale (inverse of the kv) is optional
 float ropescale = 0.0f;
@@ -3109,10 +3218,10 @@
 } break;
 case LLM_ARCH_BERT:
 {
-ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,
-ml.get_key(LLM_KV_ATTENTION_CAUSAL,
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-ml.get_key(LLM_KV_POOLING_TYPE,
+ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
 
 switch (hparams.n_layer) {
 case 3:
@@ -3130,10 +3239,10 @@
 } break;
 case LLM_ARCH_NOMIC_BERT:
 {
-ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,
-ml.get_key(LLM_KV_ATTENTION_CAUSAL,
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-ml.get_key(LLM_KV_POOLING_TYPE,
+ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
 
 if (hparams.n_layer == 12 && hparams.n_embd == 768) {
 model.type = e_model::MODEL_137M;
@@ -3264,6 +3373,16 @@
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+case LLM_ARCH_STARCODER2:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+switch (hparams.n_layer) {
+case 30: model.type = e_model::MODEL_3B; break;
+case 32: model.type = e_model::MODEL_7B; break;
+case 40: model.type = e_model::MODEL_15B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
 default: (void)0;
 }
 
@@ -3272,6 +3391,8 @@
 if (hparams.f_max_alibi_bias > 0.0f) {
 hparams.need_kq_pos = true;
 }
+
+hparams.rope_type = llama_rope_type(&model);
 }
 
 // TODO: This should probably be in llama.h
@@ -3574,6 +3695,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
 LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
 LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
+LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
+LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
 LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
 LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
 LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -3640,7 +3763,7 @@ static bool llm_load_tensors(
 model.buft_layer[i] = llama_default_buffer_type_cpu(true);
 }
 
-if (split_mode ==
+if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
 // calculate the split points
 int device_count = llama_get_device_count();
 bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
@@ -3679,10 +3802,10 @@
 }
 } else {
 ggml_backend_buffer_type_t split_buft;
-if (split_mode ==
+if (split_mode == LLAMA_SPLIT_MODE_ROW) {
 split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
 } else {
-//
+// LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
 split_buft = llama_default_buffer_type_offload(main_gpu);
 }
 // assign the repeating layers
@@ -4430,6 +4553,56 @@
 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
 }
 } break;
+case LLM_ARCH_STARCODER2:
+{
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+// output
+{
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+// if output is NULL, init from the input tok embed
+if (model.output == NULL) {
+model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ml.n_created--; // artificial tensor
+ml.size_data += ggml_nbytes(model.output);
+}
+
+}
+
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);
+
+auto & layer = model.layers[i];
+
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+// optional bias tensors
+layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+
+// optional bias tensors
+layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff});
+}
+} break;
 default:
 throw std::runtime_error("unknown architecture");
 }
@@ -4595,12 +4768,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
 
 using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
 
-enum llm_rope_type {
-LLM_ROPE,
-LLM_ROPE_NEOX,
-LLM_ROPE_GLM,
-};
-
 enum llm_ffn_op_type {
 LLM_FFN_SILU,
 LLM_FFN_GELU,
@@ -4646,55 +4813,6 @@ static struct ggml_tensor * llm_build_inp_embd(
 return inpL;
 }
 
-// Persimmon: n_rot = n_embd_head_k/2
-// Other: n_rot = n_embd_head_k
-static void llm_build_k_shift(
-struct ggml_context * ctx,
-const llama_hparams & hparams,
-const llama_cparams & cparams,
-const llama_kv_cache & kv,
-struct ggml_cgraph * graph,
-struct ggml_tensor * K_shift,
-llm_rope_type type,
-int64_t n_ctx,
-float freq_base,
-float freq_scale,
-const llm_build_cb & cb) {
-const int64_t n_layer = hparams.n_layer;
-const int64_t n_head_kv = hparams.n_head_kv;
-const int64_t n_embd_head_k = hparams.n_embd_head_k;
-const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-const int32_t n_rot = hparams.n_rot;
-const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
-const float ext_factor = cparams.yarn_ext_factor;
-const float attn_factor = cparams.yarn_attn_factor;
-const float beta_fast = cparams.yarn_beta_fast;
-const float beta_slow = cparams.yarn_beta_slow;
-
-int rope_type = 0;
-
-switch (type) {
-case LLM_ROPE: rope_type = 0; break;
-case LLM_ROPE_NEOX: rope_type = 2; break;
-case LLM_ROPE_GLM: rope_type = 4; break;
-}
-
-for (int il = 0; il < n_layer; ++il) {
-struct ggml_tensor * tmp =
-// we rotate only the first n_rot dimensions
-ggml_rope_custom_inplace(ctx,
-ggml_view_3d(ctx, kv.k_l[il],
-n_embd_head_k, n_head_kv, n_ctx,
-ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
-ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
-0),
-K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-ext_factor, attn_factor, beta_fast, beta_slow);
-cb(tmp, "K_shifted", il);
-ggml_build_forward_expand(graph, tmp);
-}
-}
-
 static void llm_build_kv_store(
 struct ggml_context * ctx,
 const llama_hparams & hparams,
@@ -4896,8 +5014,8 @@ static struct ggml_tensor * llm_build_kqv(
 ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
 }
 
-#if defined(
-#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for
+#if defined(GGML_USE_KOMPUTE)
+#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
 #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
 #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
 if (hparams.f_max_alibi_bias > 0.0f) {
@@ -4981,6 +5099,7 @@ static struct ggml_tensor * llm_build_kv(
 llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
 
 struct ggml_tensor * cur;
+
 cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
 q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
 cb(cur, "kqv_out", il);
@@ -4998,6 +5117,7 @@ struct llm_build_context {
 
 const int64_t n_embd;
 const int64_t n_layer;
+const int64_t n_rot;
 const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
 const int64_t n_head;
 const int64_t n_head_kv;
@@ -5022,8 +5142,8 @@ struct llm_build_context {
 const int32_t kv_head; // index of where we store new KV data in the cache
 const int32_t n_orig_ctx;
 
-const
-const
+const enum llama_pooling_type pooling_type;
+const enum llama_rope_type rope_type;
 
 const llm_build_cb & cb;
 
@@ -5045,6 +5165,7 @@
 kv_self (lctx.kv_self),
 n_embd (hparams.n_embd),
 n_layer (hparams.n_layer),
+n_rot (hparams.n_rot),
 n_ctx (cparams.n_ctx),
 n_head (hparams.n_head),
 n_head_kv (hparams.n_head_kv),
@@ -5066,8 +5187,8 @@
 n_kv (worst_case ? n_ctx : kv_self.n),
 kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
 n_orig_ctx (cparams.n_yarn_orig_ctx),
-
-
+pooling_type (cparams.pooling_type),
+rope_type (hparams.rope_type),
 cb (cb),
 buf_compute_meta (lctx.buf_compute_meta) {
 // all initializations should be done in init()
@@ -5090,6 +5211,76 @@ struct llm_build_context {
 }
 }
 
+struct ggml_cgraph * build_k_shift() {
+struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+for (int il = 0; il < n_layer; ++il) {
+struct ggml_tensor * tmp =
+// we rotate only the first n_rot dimensions
+ggml_rope_custom_inplace(ctx0,
+ggml_view_3d(ctx0, kv_self.k_l[il],
+n_embd_head_k, n_head_kv, n_ctx,
+ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
+ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+0),
+lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow);
+cb(tmp, "K_shifted", il);
+ggml_build_forward_expand(gf, tmp);
+}
+
+return gf;
+}
+
+struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+for (uint32_t i = 0; i < ids.size(); ++i) {
+const uint32_t id = ids[i];
+
+if (i == id || id == ids.size()) {
+continue;
+}
+
+uint32_t nm = 1;
+
+while (i + nm < ids.size() && ids[i + nm] == id + nm) {
+nm++;
+}
+
+for (int il = 0; il < n_layer; ++il) {
+ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
+n_embd_k_gqa, nm,
+ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+
+ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
+n_embd_k_gqa, nm,
+ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+
+ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+nm, n_embd_v_gqa,
+ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+ggml_row_size(kv_self.v_l[il]->type, i));
+
+ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+nm, n_embd_v_gqa,
+ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+ggml_row_size(kv_self.v_l[il]->type, id));
+
+ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
+ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
+}
+
+i += nm - 1;
+}
+
+//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
+
+return gf;
+}
+
 struct ggml_cgraph * build_llama() {
 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
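`build_defrag()` above batches cell moves: consecutive sources that map onto consecutive destinations are merged into a run of length `nm`, so each run costs one copy per layer instead of `nm` single-cell copies. A standalone sketch of just that grouping logic on a plain `ids` vector (illustrative code, not part of the diff):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

struct move_range { uint32_t src, dst, len; };

// ids[i] is the destination cell for source cell i; ids.size() marks "does not move"
static std::vector<move_range> group_moves(const std::vector<uint32_t> & ids) {
    std::vector<move_range> moves;
    for (uint32_t i = 0; i < ids.size(); ++i) {
        const uint32_t id = ids[i];
        if (i == id || id == ids.size()) {
            continue;                              // already in place or not moved at all
        }
        uint32_t nm = 1;                           // grow the run while both sides stay contiguous
        while (i + nm < ids.size() && ids[i + nm] == id + nm) {
            nm++;
        }
        moves.push_back({i, id, nm});              // one bulk copy instead of nm single-cell copies
        i += nm - 1;
    }
    return moves;
}

int main() {
    // cells 4..6 move down to 1..3, so three moves collapse into a single run
    const std::vector<uint32_t> ids = {0, 7, 7, 7, 1, 2, 3};
    for (const move_range & m : group_moves(ids)) {
        std::printf("copy %u cells: %u -> %u\n", m.len, m.src, m.dst);
    }
    return 0;
}
```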
@@ -5111,11 +5302,6 @@ struct llm_build_context {
|
|
5111
5302
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
5112
5303
|
cb(KQ_mask, "KQ_mask", -1);
|
5113
5304
|
|
5114
|
-
// shift the entire K-cache if needed
|
5115
|
-
if (do_rope_shift) {
|
5116
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
5117
|
-
}
|
5118
|
-
|
5119
5305
|
for (int il = 0; il < n_layer; ++il) {
|
5120
5306
|
struct ggml_tensor * inpSA = inpL;
|
5121
5307
|
|
@@ -5151,14 +5337,14 @@ struct llm_build_context {
|
|
5151
5337
|
|
5152
5338
|
Qcur = ggml_rope_custom(
|
5153
5339
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
5154
|
-
|
5340
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
5155
5341
|
ext_factor, attn_factor, beta_fast, beta_slow
|
5156
5342
|
);
|
5157
5343
|
cb(Qcur, "Qcur", il);
|
5158
5344
|
|
5159
5345
|
Kcur = ggml_rope_custom(
|
5160
5346
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
5161
|
-
|
5347
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
5162
5348
|
ext_factor, attn_factor, beta_fast, beta_slow
|
5163
5349
|
);
|
5164
5350
|
cb(Kcur, "Kcur", il);
|
@@ -5299,11 +5485,6 @@ struct llm_build_context {
|
|
5299
5485
|
struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
|
5300
5486
|
cb(KQ_pos, "KQ_pos", -1);
|
5301
5487
|
|
5302
|
-
// shift the entire K-cache if needed
|
5303
|
-
if (do_rope_shift) {
|
5304
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
5305
|
-
}
|
5306
|
-
|
5307
5488
|
for (int il = 0; il < n_layer; ++il) {
|
5308
5489
|
struct ggml_tensor * inpSA = inpL;
|
5309
5490
|
|
@@ -5327,12 +5508,12 @@ struct llm_build_context {
|
|
5327
5508
|
case MODEL_7B:
|
5328
5509
|
Qcur = ggml_rope_custom(
|
5329
5510
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
5330
|
-
|
5511
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
5331
5512
|
ext_factor, attn_factor, beta_fast, beta_slow
|
5332
5513
|
);
|
5333
5514
|
Kcur = ggml_rope_custom(
|
5334
5515
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
5335
|
-
|
5516
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
5336
5517
|
ext_factor, attn_factor, beta_fast, beta_slow
|
5337
5518
|
);
|
5338
5519
|
break;
|
@@ -5417,11 +5598,6 @@ struct llm_build_context {
|
|
5417
5598
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
5418
5599
|
cb(KQ_mask, "KQ_mask", -1);
|
5419
5600
|
|
5420
|
-
// shift the entire K-cache if needed
|
5421
|
-
if (do_rope_shift) {
|
5422
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
5423
|
-
}
|
5424
|
-
|
5425
5601
|
for (int il = 0; il < n_layer; ++il) {
|
5426
5602
|
struct ggml_tensor * attn_norm;
|
5427
5603
|
|
@@ -5460,13 +5636,13 @@ struct llm_build_context {
|
|
5460
5636
|
|
5461
5637
|
// using mode = 2 for neox mode
|
5462
5638
|
Qcur = ggml_rope_custom(
|
5463
|
-
ctx0, Qcur, inp_pos,
|
5639
|
+
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
5464
5640
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
5465
5641
|
);
|
5466
5642
|
cb(Qcur, "Qcur", il);
|
5467
5643
|
|
5468
5644
|
Kcur = ggml_rope_custom(
|
5469
|
-
ctx0, Kcur, inp_pos,
|
5645
|
+
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
5470
5646
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
5471
5647
|
);
|
5472
5648
|
cb(Kcur, "Kcur", il);
|
@@ -5636,10 +5812,6 @@ struct llm_build_context {
|
|
5636
5812
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
5637
5813
|
cb(KQ_mask, "KQ_mask", -1);
|
5638
5814
|
|
5639
|
-
if (do_rope_shift) {
|
5640
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
5641
|
-
}
|
5642
|
-
|
5643
5815
|
for (int il = 0; il < n_layer; ++il) {
|
5644
5816
|
struct ggml_tensor * residual = inpL;
|
5645
5817
|
|
@@ -5697,7 +5869,7 @@ struct llm_build_context {
|
|
5697
5869
|
|
5698
5870
|
// RoPE the first n_rot of q/k, pass the other half, and concat.
|
5699
5871
|
struct ggml_tensor * qrot = ggml_view_3d(
|
5700
|
-
ctx0, tmpq,
|
5872
|
+
ctx0, tmpq, n_rot, n_head, n_tokens,
|
5701
5873
|
ggml_element_size(tmpq) * n_embd_head,
|
5702
5874
|
ggml_element_size(tmpq) * n_embd_head * n_head,
|
5703
5875
|
0
|
@@ -5705,7 +5877,7 @@ struct llm_build_context {
|
|
5705
5877
|
cb(qrot, "qrot", il);
|
5706
5878
|
|
5707
5879
|
struct ggml_tensor * krot = ggml_view_3d(
|
5708
|
-
ctx0, tmpk,
|
5880
|
+
ctx0, tmpk, n_rot, n_head, n_tokens,
|
5709
5881
|
ggml_element_size(tmpk) * n_embd_head,
|
5710
5882
|
ggml_element_size(tmpk) * n_embd_head * n_head,
|
5711
5883
|
0
|
@@ -5714,29 +5886,29 @@ struct llm_build_context {
|
|
5714
5886
|
|
5715
5887
|
// get the second half of tmpq, e.g tmpq[n_rot:, :, :]
|
5716
5888
|
struct ggml_tensor * qpass = ggml_view_3d(
|
5717
|
-
ctx0, tmpq,
|
5889
|
+
ctx0, tmpq, n_rot, n_head, n_tokens,
|
5718
5890
|
ggml_element_size(tmpq) * n_embd_head,
|
5719
5891
|
ggml_element_size(tmpq) * n_embd_head * n_head,
|
5720
|
-
ggml_element_size(tmpq) *
|
5892
|
+
ggml_element_size(tmpq) * n_rot
|
5721
5893
|
);
|
5722
5894
|
cb(qpass, "qpass", il);
|
5723
5895
|
|
5724
5896
|
struct ggml_tensor * kpass = ggml_view_3d(
|
5725
|
-
ctx0, tmpk,
|
5897
|
+
ctx0, tmpk, n_rot, n_head, n_tokens,
|
5726
5898
|
ggml_element_size(tmpk) * n_embd_head,
|
5727
5899
|
ggml_element_size(tmpk) * n_embd_head * n_head,
|
5728
|
-
ggml_element_size(tmpk) *
|
5900
|
+
ggml_element_size(tmpk) * n_rot
|
5729
5901
|
);
|
5730
5902
|
cb(kpass, "kpass", il);
|
5731
5903
|
|
5732
5904
|
struct ggml_tensor * qrotated = ggml_rope_custom(
|
5733
|
-
ctx0, qrot, inp_pos,
|
5905
|
+
ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
5734
5906
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
5735
5907
|
);
|
5736
5908
|
cb(qrotated, "qrotated", il);
|
5737
5909
|
|
5738
5910
|
struct ggml_tensor * krotated = ggml_rope_custom(
|
5739
|
-
ctx0, krot, inp_pos,
|
5911
|
+
ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
5740
5912
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
5741
5913
|
);
|
5742
5914
|
cb(krotated, "krotated", il);
|
@@ -5921,6 +6093,7 @@ struct llm_build_context {
|
|
5921
6093
|
|
5922
6094
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
5923
6095
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
6096
|
+
|
5924
6097
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
5925
6098
|
|
5926
6099
|
struct ggml_tensor * cur;
|
@@ -5928,9 +6101,10 @@ struct llm_build_context {
|
|
5928
6101
|
|
5929
6102
|
// get input vectors with right size
|
5930
6103
|
const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
|
5931
|
-
|
6104
|
+
|
6105
|
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
5932
6106
|
struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
|
5933
|
-
struct ggml_tensor * inp_cls
|
6107
|
+
struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
|
5934
6108
|
|
5935
6109
|
// construct input embeddings (token, type, position)
|
5936
6110
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
@@ -5948,39 +6122,38 @@ struct llm_build_context {
|
|
5948
6122
|
cb(inpL, "inp_norm", -1);
|
5949
6123
|
|
5950
6124
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5951
|
-
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask,
|
5952
|
-
cb(KQ_mask, "KQ_mask", -1); // [
|
6125
|
+
struct ggml_tensor * KQ_mask = ggml_cont(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_tokens, n_tokens, n_tokens*ggml_type_size(lctx.inp_KQ_mask->type), 0));
|
6126
|
+
cb(KQ_mask, "KQ_mask", -1); // [n_tokens, n_tokens]
|
5953
6127
|
|
5954
6128
|
// iterate layers
|
5955
6129
|
for (int il = 0; il < n_layer; ++il) {
|
5956
6130
|
struct ggml_tensor * cur = inpL;
|
5957
6131
|
|
6132
|
+
struct ggml_tensor * Qcur;
|
6133
|
+
struct ggml_tensor * Kcur;
|
6134
|
+
struct ggml_tensor * Vcur;
|
6135
|
+
|
5958
6136
|
// self-attention
|
5959
6137
|
if (model.arch == LLM_ARCH_BERT) {
|
5960
|
-
|
6138
|
+
Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
|
5961
6139
|
cb(Qcur, "Qcur", il);
|
5962
6140
|
|
5963
|
-
|
6141
|
+
Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
|
5964
6142
|
cb(Kcur, "Kcur", il);
|
5965
6143
|
|
5966
|
-
|
6144
|
+
Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
|
5967
6145
|
cb(Vcur, "Vcur", il);
|
5968
6146
|
|
5969
|
-
|
5970
|
-
|
5971
|
-
|
5972
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5973
|
-
model.layers[il].wo, model.layers[il].bo,
|
5974
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5975
|
-
cb(cur, "kqv_out", il);
|
6147
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
6148
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
5976
6149
|
} else {
|
5977
6150
|
// compute Q and K and RoPE them
|
5978
6151
|
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
5979
6152
|
cb(cur, "wqkv", il);
|
5980
6153
|
|
5981
|
-
|
5982
|
-
|
5983
|
-
|
6154
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
6155
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
6156
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
5984
6157
|
|
5985
6158
|
cb(Qcur, "Qcur", il);
|
5986
6159
|
cb(Kcur, "Kcur", il);
|
@@ -5988,24 +6161,52 @@ struct llm_build_context {
|
|
5988
6161
|
|
5989
6162
|
Qcur = ggml_rope_custom(
|
5990
6163
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
5991
|
-
|
6164
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
5992
6165
|
ext_factor, attn_factor, beta_fast, beta_slow
|
5993
6166
|
);
|
5994
6167
|
cb(Qcur, "Qcur", il);
|
5995
6168
|
|
5996
6169
|
Kcur = ggml_rope_custom(
|
5997
6170
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
5998
|
-
|
6171
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
5999
6172
|
ext_factor, attn_factor, beta_fast, beta_slow
|
6000
6173
|
);
|
6001
6174
|
cb(Kcur, "Kcur", il);
|
6175
|
+
}
|
6002
6176
|
|
6003
|
-
|
6004
|
-
|
6005
|
-
|
6006
|
-
|
6177
|
+
struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
6178
|
+
struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
|
6179
|
+
|
6180
|
+
struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
|
6181
|
+
cb(kq, "kq", il);
|
6182
|
+
|
6183
|
+
kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
|
6184
|
+
cb(kq, "kq_soft_max_ext", il);
|
6185
|
+
|
6186
|
+
struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
|
6187
|
+
cb(v, "v", il);
|
6188
|
+
|
6189
|
+
struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
|
6190
|
+
cb(kqv, "kqv", il);
|
6191
|
+
|
6192
|
+
struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
|
6193
|
+
cb(kqv_merged, "kqv_merged", il);
|
6194
|
+
|
6195
|
+
cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
|
6196
|
+
cb(cur, "kqv_merged_cont", il);
|
6197
|
+
|
6198
|
+
ggml_build_forward_expand(gf, cur);
|
6199
|
+
|
6200
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
|
6201
|
+
if (model.layers[il].bo) {
|
6202
|
+
cb(cur, "kqv_wo", il);
|
6007
6203
|
}
|
6008
6204
|
|
6205
|
+
if (model.layers[il].bo) {
|
6206
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bo);
|
6207
|
+
}
|
6208
|
+
cb(cur, "kqv_out", il);
|
6209
|
+
|
6009
6210
|
// re-add the layer input
|
6010
6211
|
cur = ggml_add(ctx0, cur, inpL);
|
6011
6212
|
|
@@ -6045,16 +6246,29 @@ struct llm_build_context {
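The hunk below adds a pooling stage to the embedding graph: LLAMA_POOLING_TYPE_MEAN averages the token embeddings of each sequence (done in the graph as a matrix multiply with inp_mean), LLAMA_POOLING_TYPE_CLS picks the embedding of each sequence's first token via ggml_get_rows, and NONE leaves the per-token embeddings untouched. A rough standalone sketch of what the two pooled variants compute; the helper names are mine, and each sequence is assumed to contain at least one token.

#include <cstddef>
#include <vector>

// embd:    [n_tokens][n_embd] per-token embeddings ("result_embd")
// seq_ids: sequence id of each token, in 0..n_seq-1
// Returns one pooled embedding per sequence ("result_embd_pooled").
std::vector<std::vector<float>> pool_mean(
        const std::vector<std::vector<float>> & embd,
        const std::vector<int> & seq_ids, int n_seq) {
    const size_t n_embd = embd[0].size();
    std::vector<std::vector<float>> out(n_seq, std::vector<float>(n_embd, 0.0f));
    std::vector<int> count(n_seq, 0);
    for (size_t i = 0; i < embd.size(); ++i) {
        count[seq_ids[i]]++;
        for (size_t d = 0; d < n_embd; ++d) {
            out[seq_ids[i]][d] += embd[i][d];
        }
    }
    for (int s = 0; s < n_seq; ++s) {
        for (size_t d = 0; d < n_embd; ++d) {
            out[s][d] /= (float) count[s];
        }
    }
    return out;
}

// CLS pooling: keep the embedding of the first token (pos == 0) of each sequence.
std::vector<std::vector<float>> pool_cls(
        const std::vector<std::vector<float>> & embd,
        const std::vector<int> & seq_ids,
        const std::vector<int> & pos, int n_seq) {
    std::vector<std::vector<float>> out(n_seq);
    for (size_t i = 0; i < embd.size(); ++i) {
        if (pos[i] == 0) {
            out[seq_ids[i]] = embd[i];
        }
    }
    return out;
}
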
|
|
6045
6246
|
|
6046
6247
|
// final output
|
6047
6248
|
cur = inpL;
|
6249
|
+
cb(cur, "result_embd", -1);
|
6048
6250
|
|
6049
6251
|
// pooling layer
|
6050
|
-
|
6051
|
-
|
6052
|
-
|
6053
|
-
|
6054
|
-
|
6055
|
-
|
6252
|
+
switch (pooling_type) {
|
6253
|
+
case LLAMA_POOLING_TYPE_NONE:
|
6254
|
+
{
|
6255
|
+
// nop
|
6256
|
+
} break;
|
6257
|
+
case LLAMA_POOLING_TYPE_MEAN:
|
6258
|
+
{
|
6259
|
+
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
|
6260
|
+
cb(cur, "result_embd_pooled", -1);
|
6261
|
+
} break;
|
6262
|
+
case LLAMA_POOLING_TYPE_CLS:
|
6263
|
+
{
|
6264
|
+
cur = ggml_get_rows(ctx0, cur, inp_cls);
|
6265
|
+
cb(cur, "result_embd_pooled", -1);
|
6266
|
+
} break;
|
6267
|
+
case LLAMA_POOLING_TYPE_UNSPECIFIED:
|
6268
|
+
{
|
6269
|
+
GGML_ASSERT(false && "Invalid pooling type");
|
6270
|
+
} break;
|
6056
6271
|
}
|
6057
|
-
cb(cur, "result_embd", -1);
|
6058
6272
|
|
6059
6273
|
ggml_build_forward_expand(gf, cur);
|
6060
6274
|
|
@@ -6284,11 +6498,6 @@ struct llm_build_context {
|
|
6284
6498
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
6285
6499
|
cb(KQ_mask, "KQ_mask", -1);
|
6286
6500
|
|
6287
|
-
// shift the entire K-cache if needed
|
6288
|
-
if (do_rope_shift) {
|
6289
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
6290
|
-
}
|
6291
|
-
|
6292
6501
|
for (int il = 0; il < n_layer; ++il) {
|
6293
6502
|
struct ggml_tensor * inpSA = inpL;
|
6294
6503
|
|
@@ -6325,14 +6534,14 @@ struct llm_build_context {
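This and the surrounding hunks change every ggml_rope_custom call to pass the rotary-embedding parameters explicitly (n_rot, rope_type, n_orig_ctx, freq_base, freq_scale, and the YaRN factors) instead of relying on the old shorter form. For orientation only, here is a minimal standalone sketch of plain RoPE applied to one head vector; the function name is mine, the pairing of adjacent dimensions corresponds to the "normal" mode (NEOX-style rope pairs dimensions differently), and ext_factor, attn_factor and the beta_* ramp from the real call are deliberately ignored.

#include <cmath>
#include <vector>

// Rotate the first n_rot dimensions of a head vector x (n_rot even, x.size() >= n_rot)
// by theta = pos * freq_scale * freq_base^(-i/n_rot) for each adjacent pair (i, i+1).
void rope_simple(std::vector<float> & x, int pos, int n_rot,
                 float freq_base = 10000.0f, float freq_scale = 1.0f) {
    for (int i = 0; i < n_rot; i += 2) {
        const float theta = (float) pos * freq_scale *
                            std::pow(freq_base, -(float) i / (float) n_rot);
        const float c  = std::cos(theta);
        const float s  = std::sin(theta);
        const float x0 = x[i];
        const float x1 = x[i + 1];
        x[i]     = x0 * c - x1 * s;   // rotate the (x0, x1) pair by theta
        x[i + 1] = x0 * s + x1 * c;
    }
}
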
|
|
6325
6534
|
|
6326
6535
|
Qcur = ggml_rope_custom(
|
6327
6536
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
6328
|
-
|
6537
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6329
6538
|
ext_factor, attn_factor, beta_fast, beta_slow
|
6330
6539
|
);
|
6331
6540
|
cb(Qcur, "Qcur", il);
|
6332
6541
|
|
6333
6542
|
Kcur = ggml_rope_custom(
|
6334
6543
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6335
|
-
|
6544
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6336
6545
|
ext_factor, attn_factor, beta_fast, beta_slow
|
6337
6546
|
);
|
6338
6547
|
cb(Kcur, "Kcur", il);
|
@@ -6407,11 +6616,6 @@ struct llm_build_context {
|
|
6407
6616
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
6408
6617
|
cb(KQ_mask, "KQ_mask", -1);
|
6409
6618
|
|
6410
|
-
// shift the entire K-cache if needed
|
6411
|
-
if (do_rope_shift) {
|
6412
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
6413
|
-
}
|
6414
|
-
|
6415
6619
|
for (int il = 0; il < n_layer; ++il) {
|
6416
6620
|
struct ggml_tensor * inpSA = inpL;
|
6417
6621
|
|
@@ -6441,13 +6645,13 @@ struct llm_build_context {
|
|
6441
6645
|
|
6442
6646
|
// using mode = 2 for neox mode
|
6443
6647
|
Qcur = ggml_rope_custom(
|
6444
|
-
ctx0, Qcur, inp_pos,
|
6648
|
+
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
6445
6649
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
6446
6650
|
);
|
6447
6651
|
cb(Qcur, "Qcur", il);
|
6448
6652
|
|
6449
6653
|
Kcur = ggml_rope_custom(
|
6450
|
-
ctx0, Kcur, inp_pos,
|
6654
|
+
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
6451
6655
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
6452
6656
|
);
|
6453
6657
|
cb(Kcur, "Kcur", il);
|
@@ -6521,11 +6725,6 @@ struct llm_build_context {
|
|
6521
6725
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
6522
6726
|
cb(KQ_mask, "KQ_mask", -1);
|
6523
6727
|
|
6524
|
-
// shift the entire K-cache if needed
|
6525
|
-
if (do_rope_shift) {
|
6526
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
6527
|
-
}
|
6528
|
-
|
6529
6728
|
for (int il = 0; il < n_layer; ++il) {
|
6530
6729
|
struct ggml_tensor * inpSA = inpL;
|
6531
6730
|
|
@@ -6561,14 +6760,14 @@ struct llm_build_context {
|
|
6561
6760
|
|
6562
6761
|
Qcur = ggml_rope_custom(
|
6563
6762
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
6564
|
-
|
6763
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6565
6764
|
ext_factor, attn_factor, beta_fast, beta_slow
|
6566
6765
|
);
|
6567
6766
|
cb(Qcur, "Qcur", il);
|
6568
6767
|
|
6569
6768
|
Kcur = ggml_rope_custom(
|
6570
6769
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6571
|
-
|
6770
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6572
6771
|
ext_factor, attn_factor, beta_fast, beta_slow
|
6573
6772
|
);
|
6574
6773
|
cb(Kcur, "Kcur", il);
|
@@ -6642,11 +6841,6 @@ struct llm_build_context {
|
|
6642
6841
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
6643
6842
|
cb(KQ_mask, "KQ_mask", -1);
|
6644
6843
|
|
6645
|
-
// shift the entire K-cache if needed
|
6646
|
-
if (do_rope_shift) {
|
6647
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
6648
|
-
}
|
6649
|
-
|
6650
6844
|
for (int il = 0; il < n_layer; ++il) {
|
6651
6845
|
attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
6652
6846
|
model.layers[il].attn_norm,
|
@@ -6684,7 +6878,7 @@ struct llm_build_context {
|
|
6684
6878
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
6685
6879
|
|
6686
6880
|
Qcur = ggml_rope_custom(
|
6687
|
-
ctx0, Qcur, inp_pos,
|
6881
|
+
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
6688
6882
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
6689
6883
|
);
|
6690
6884
|
cb(Qcur, "Qcur", il);
|
@@ -6695,7 +6889,7 @@ struct llm_build_context {
|
|
6695
6889
|
cb(Qcur, "Qcur", il);
|
6696
6890
|
|
6697
6891
|
Kcur = ggml_rope_custom(
|
6698
|
-
ctx0, Kcur, inp_pos,
|
6892
|
+
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
6699
6893
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
6700
6894
|
);
|
6701
6895
|
cb(Kcur, "Kcur", il);
|
@@ -6764,11 +6958,6 @@ struct llm_build_context {
|
|
6764
6958
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
6765
6959
|
cb(KQ_mask, "KQ_mask", -1);
|
6766
6960
|
|
6767
|
-
// shift the entire K-cache if needed
|
6768
|
-
if (do_rope_shift) {
|
6769
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
6770
|
-
}
|
6771
|
-
|
6772
6961
|
for (int il = 0; il < n_layer; ++il) {
|
6773
6962
|
|
6774
6963
|
// norm
|
@@ -6792,14 +6981,14 @@ struct llm_build_context {
|
|
6792
6981
|
cb(Vcur, "Vcur", il);
|
6793
6982
|
|
6794
6983
|
Qcur = ggml_rope_custom(
|
6795
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur,
|
6796
|
-
n_embd_head,
|
6984
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
|
6985
|
+
n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6797
6986
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
6798
6987
|
cb(Qcur, "Qcur", il);
|
6799
6988
|
|
6800
6989
|
Kcur = ggml_rope_custom(
|
6801
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur,
|
6802
|
-
n_embd_head,
|
6990
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
|
6991
|
+
n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6803
6992
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
6804
6993
|
cb(Kcur, "Kcur", il);
|
6805
6994
|
|
@@ -6969,11 +7158,6 @@ struct llm_build_context {
|
|
6969
7158
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
6970
7159
|
cb(KQ_mask, "KQ_mask", -1);
|
6971
7160
|
|
6972
|
-
// shift the entire K-cache if needed
|
6973
|
-
if (do_rope_shift) {
|
6974
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
6975
|
-
}
|
6976
|
-
|
6977
7161
|
for (int il = 0; il < n_layer; ++il) {
|
6978
7162
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
6979
7163
|
model.layers[il].attn_norm,
|
@@ -6999,14 +7183,14 @@ struct llm_build_context {
|
|
6999
7183
|
|
7000
7184
|
struct ggml_tensor * Qcur = ggml_rope_custom(
|
7001
7185
|
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
|
7002
|
-
|
7186
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7003
7187
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7004
7188
|
);
|
7005
7189
|
cb(Qcur, "Qcur", il);
|
7006
7190
|
|
7007
7191
|
struct ggml_tensor * Kcur = ggml_rope_custom(
|
7008
7192
|
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7009
|
-
|
7193
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7010
7194
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7011
7195
|
);
|
7012
7196
|
cb(Kcur, "Kcur", il);
|
@@ -7077,11 +7261,6 @@ struct llm_build_context {
|
|
7077
7261
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
7078
7262
|
cb(KQ_mask, "KQ_mask", -1);
|
7079
7263
|
|
7080
|
-
// shift the entire K-cache if needed
|
7081
|
-
if (do_rope_shift) {
|
7082
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
7083
|
-
}
|
7084
|
-
|
7085
7264
|
for (int il = 0; il < n_layer; ++il) {
|
7086
7265
|
struct ggml_tensor * inpSA = inpL;
|
7087
7266
|
|
@@ -7117,14 +7296,14 @@ struct llm_build_context {
|
|
7117
7296
|
|
7118
7297
|
Qcur = ggml_rope_custom(
|
7119
7298
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7120
|
-
|
7299
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7121
7300
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7122
7301
|
);
|
7123
7302
|
cb(Qcur, "Qcur", il);
|
7124
7303
|
|
7125
7304
|
Kcur = ggml_rope_custom(
|
7126
7305
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7127
|
-
|
7306
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7128
7307
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7129
7308
|
);
|
7130
7309
|
cb(Kcur, "Kcur", il);
|
@@ -7196,11 +7375,6 @@ struct llm_build_context {
|
|
7196
7375
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
7197
7376
|
cb(KQ_mask, "KQ_mask", -1);
|
7198
7377
|
|
7199
|
-
// shift the entire K-cache if needed
|
7200
|
-
if (do_rope_shift) {
|
7201
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
7202
|
-
}
|
7203
|
-
|
7204
7378
|
for (int il = 0; il < n_layer; ++il) {
|
7205
7379
|
struct ggml_tensor * inpSA = inpL;
|
7206
7380
|
|
@@ -7236,14 +7410,14 @@ struct llm_build_context {
|
|
7236
7410
|
|
7237
7411
|
Qcur = ggml_rope_custom(
|
7238
7412
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7239
|
-
|
7413
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7240
7414
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7241
7415
|
);
|
7242
7416
|
cb(Qcur, "Qcur", il);
|
7243
7417
|
|
7244
7418
|
Kcur = ggml_rope_custom(
|
7245
7419
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7246
|
-
|
7420
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7247
7421
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7248
7422
|
);
|
7249
7423
|
cb(Kcur, "Kcur", il);
|
@@ -7328,11 +7502,6 @@ struct llm_build_context {
|
|
7328
7502
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
7329
7503
|
cb(KQ_mask, "KQ_mask", -1);
|
7330
7504
|
|
7331
|
-
// shift the entire K-cache if needed
|
7332
|
-
if (do_rope_shift) {
|
7333
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
7334
|
-
}
|
7335
|
-
|
7336
7505
|
for (int il = 0; il < n_layer; ++il) {
|
7337
7506
|
struct ggml_tensor * inpSA = inpL;
|
7338
7507
|
|
@@ -7368,14 +7537,14 @@ struct llm_build_context {
|
|
7368
7537
|
|
7369
7538
|
Qcur = ggml_rope_custom(
|
7370
7539
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7371
|
-
|
7540
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7372
7541
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7373
7542
|
);
|
7374
7543
|
cb(Qcur, "Qcur", il);
|
7375
7544
|
|
7376
7545
|
Kcur = ggml_rope_custom(
|
7377
7546
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7378
|
-
|
7547
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7379
7548
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7380
7549
|
);
|
7381
7550
|
cb(Kcur, "Kcur", il);
|
@@ -7464,11 +7633,6 @@ struct llm_build_context {
|
|
7464
7633
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
7465
7634
|
cb(KQ_mask, "KQ_mask", -1);
|
7466
7635
|
|
7467
|
-
// shift the entire K-cache if needed
|
7468
|
-
if (do_rope_shift) {
|
7469
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
7470
|
-
}
|
7471
|
-
|
7472
7636
|
for (int il = 0; il < n_layer; ++il) {
|
7473
7637
|
|
7474
7638
|
// norm
|
@@ -7491,7 +7655,7 @@ struct llm_build_context {
|
|
7491
7655
|
|
7492
7656
|
Qcur = ggml_rope_custom(
|
7493
7657
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
|
7494
|
-
n_embd_head_k,
|
7658
|
+
n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7495
7659
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
7496
7660
|
cb(Qcur, "Qcur", il);
|
7497
7661
|
|
@@ -7500,7 +7664,7 @@ struct llm_build_context {
|
|
7500
7664
|
|
7501
7665
|
Kcur = ggml_rope_custom(
|
7502
7666
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
|
7503
|
-
n_embd_head_k,
|
7667
|
+
n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7504
7668
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
7505
7669
|
cb(Kcur, "Kcur", il);
|
7506
7670
|
|
@@ -7551,33 +7715,181 @@ struct llm_build_context {
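The large hunk below adds the StarCoder2 graph builder (build_starcoder2) plus the llama_build_graph_defrag and llama_build_graph_k_shift helpers and the node callback used by llama_build_graph. StarCoder2's feed-forward block is the classic sequential MLP with biases and GELU (llm_build_ffn with LLM_FFN_GELU and LLM_FFN_SEQ). As a rough standalone illustration of that block applied to a single vector — the weight layout and the tanh-based GELU approximation are assumptions of this sketch, not taken from the diff:

#include <cmath>
#include <cstddef>
#include <vector>

static float gelu_tanh(float x) {
    // common tanh approximation of GELU; 0.79788456 ~= sqrt(2/pi)
    return 0.5f * x * (1.0f + std::tanh(0.79788456f * (x + 0.044715f * x * x * x)));
}

// y = W_down * gelu(W_up * x + b_up) + b_down
std::vector<float> ffn_gelu_seq(
        const std::vector<std::vector<float>> & W_up,   // [n_ff][n_embd]
        const std::vector<float> & b_up,                // [n_ff]
        const std::vector<std::vector<float>> & W_down, // [n_embd][n_ff]
        const std::vector<float> & b_down,              // [n_embd]
        const std::vector<float> & x) {                 // [n_embd]
    std::vector<float> h(W_up.size());
    for (size_t i = 0; i < W_up.size(); ++i) {
        float acc = b_up[i];
        for (size_t j = 0; j < x.size(); ++j) {
            acc += W_up[i][j] * x[j];
        }
        h[i] = gelu_tanh(acc);
    }
    std::vector<float> y(W_down.size());
    for (size_t i = 0; i < W_down.size(); ++i) {
        float acc = b_down[i];
        for (size_t j = 0; j < h.size(); ++j) {
            acc += W_down[i][j] * h[j];
        }
        y[i] = acc;
    }
    return y;
}
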
|
|
7551
7715
|
|
7552
7716
|
return gf;
|
7553
7717
|
}
|
7554
|
-
};
|
7555
|
-
|
7556
|
-
static struct ggml_cgraph * llama_build_graph(
|
7557
|
-
llama_context & lctx,
|
7558
|
-
const llama_batch & batch,
|
7559
|
-
bool worst_case) {
|
7560
|
-
const auto & model = lctx.model;
|
7561
7718
|
|
7562
|
-
|
7563
|
-
|
7564
|
-
if (il >= 0) {
|
7565
|
-
ggml_format_name(cur, "%s-%d", name, il);
|
7566
|
-
} else {
|
7567
|
-
ggml_set_name(cur, name);
|
7568
|
-
}
|
7719
|
+
struct ggml_cgraph * build_starcoder2() {
|
7720
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
7569
7721
|
|
7570
|
-
|
7571
|
-
|
7572
|
-
|
7573
|
-
ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
|
7574
|
-
}
|
7575
|
-
}
|
7576
|
-
};
|
7722
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
7723
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
7724
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
7577
7725
|
|
7578
|
-
|
7726
|
+
struct ggml_tensor * cur;
|
7727
|
+
struct ggml_tensor * inpL;
|
7579
7728
|
|
7580
|
-
|
7729
|
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
7730
|
+
cb(inpL, "inp_embd", -1);
|
7731
|
+
|
7732
|
+
// inp_pos - contains the positions
|
7733
|
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
7734
|
+
cb(inp_pos, "inp_pos", -1);
|
7735
|
+
|
7736
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
7737
|
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
7738
|
+
cb(KQ_mask, "KQ_mask", -1);
|
7739
|
+
|
7740
|
+
for (int il = 0; il < n_layer; ++il) {
|
7741
|
+
struct ggml_tensor * inpSA = inpL;
|
7742
|
+
|
7743
|
+
// norm
|
7744
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
7745
|
+
model.layers[il].attn_norm, model.layers[il].attn_norm_b,
|
7746
|
+
LLM_NORM, cb, il);
|
7747
|
+
cb(cur, "attn_norm", il);
|
7748
|
+
|
7749
|
+
// self-attention
|
7750
|
+
{
|
7751
|
+
// compute Q and K and RoPE them
|
7752
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
7753
|
+
cb(Qcur, "Qcur", il);
|
7754
|
+
if (model.layers[il].bq) {
|
7755
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
7756
|
+
cb(Qcur, "Qcur", il);
|
7757
|
+
}
|
7758
|
+
|
7759
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
7760
|
+
cb(Kcur, "Kcur", il);
|
7761
|
+
if (model.layers[il].bk) {
|
7762
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
7763
|
+
cb(Kcur, "Kcur", il);
|
7764
|
+
}
|
7765
|
+
|
7766
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
7767
|
+
cb(Vcur, "Vcur", il);
|
7768
|
+
if (model.layers[il].bv) {
|
7769
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
7770
|
+
cb(Vcur, "Vcur", il);
|
7771
|
+
}
|
7772
|
+
|
7773
|
+
Qcur = ggml_rope_custom(
|
7774
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7775
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7776
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
7777
|
+
);
|
7778
|
+
cb(Qcur, "Qcur", il);
|
7779
|
+
|
7780
|
+
Kcur = ggml_rope_custom(
|
7781
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7782
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7783
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
7784
|
+
);
|
7785
|
+
cb(Kcur, "Kcur", il);
|
7786
|
+
|
7787
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7788
|
+
model.layers[il].wo, model.layers[il].bo,
|
7789
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7790
|
+
cb(cur, "kqv_out", il);
|
7791
|
+
}
|
7792
|
+
|
7793
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7794
|
+
cb(ffn_inp, "ffn_inp", il);
|
7795
|
+
|
7796
|
+
// feed-forward network
|
7797
|
+
|
7798
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
7799
|
+
model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
|
7800
|
+
LLM_NORM, cb, il);
|
7801
|
+
cb(cur, "ffn_norm", il);
|
7802
|
+
|
7803
|
+
cur = llm_build_ffn(ctx0, cur,
|
7804
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
7805
|
+
NULL, NULL,
|
7806
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
7807
|
+
NULL,
|
7808
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
7809
|
+
cb(cur, "ffn_out", il);
|
7810
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
7811
|
+
cb(cur, "l_out", il);
|
7812
|
+
|
7813
|
+
// input for next layer
|
7814
|
+
inpL = cur;
|
7815
|
+
}
|
7816
|
+
|
7817
|
+
cur = inpL;
|
7818
|
+
|
7819
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
7820
|
+
model.output_norm, model.output_norm_b,
|
7821
|
+
LLM_NORM, cb, -1);
|
7822
|
+
cb(cur, "result_norm", -1);
|
7823
|
+
|
7824
|
+
// lm_head
|
7825
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
7826
|
+
cb(cur, "result_output", -1);
|
7827
|
+
|
7828
|
+
ggml_build_forward_expand(gf, cur);
|
7829
|
+
|
7830
|
+
return gf;
|
7831
|
+
}
|
7832
|
+
};
|
7833
|
+
|
7834
|
+
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
7835
|
+
llama_batch dummy;
|
7836
|
+
dummy.n_tokens = 0;
|
7837
|
+
|
7838
|
+
llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
|
7839
|
+
|
7840
|
+
struct llm_build_context llm(lctx, dummy, cb, false);
|
7841
|
+
|
7842
|
+
llm.init();
|
7843
|
+
|
7844
|
+
struct ggml_cgraph * result = llm.build_defrag(ids);
|
7845
|
+
|
7846
|
+
llm.free();
|
7847
|
+
|
7848
|
+
return result;
|
7849
|
+
}
|
7850
|
+
|
7851
|
+
static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
|
7852
|
+
llama_batch dummy;
|
7853
|
+
dummy.n_tokens = 0;
|
7854
|
+
|
7855
|
+
llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
|
7856
|
+
|
7857
|
+
struct llm_build_context llm(lctx, dummy, cb, false);
|
7858
|
+
|
7859
|
+
llm.init();
|
7860
|
+
|
7861
|
+
struct ggml_cgraph * result = llm.build_k_shift();
|
7862
|
+
|
7863
|
+
llm.free();
|
7864
|
+
|
7865
|
+
return result;
|
7866
|
+
}
|
7867
|
+
|
7868
|
+
static struct ggml_cgraph * llama_build_graph(
|
7869
|
+
llama_context & lctx,
|
7870
|
+
const llama_batch & batch,
|
7871
|
+
bool worst_case) {
|
7872
|
+
const auto & model = lctx.model;
|
7873
|
+
|
7874
|
+
// this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
|
7875
|
+
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
|
7876
|
+
if (il >= 0) {
|
7877
|
+
ggml_format_name(cur, "%s-%d", name, il);
|
7878
|
+
} else {
|
7879
|
+
ggml_set_name(cur, name);
|
7880
|
+
}
|
7881
|
+
|
7882
|
+
if (!lctx.cparams.offload_kqv) {
|
7883
|
+
if (strcmp(name, "kqv_merged_cont") == 0) {
|
7884
|
+
// all nodes between the KV store and the attention output are run on the CPU
|
7885
|
+
ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
|
7886
|
+
}
|
7887
|
+
}
|
7888
|
+
};
|
7889
|
+
|
7890
|
+
struct ggml_cgraph * result = NULL;
|
7891
|
+
|
7892
|
+
struct llm_build_context llm(lctx, batch, cb, worst_case);
|
7581
7893
|
|
7582
7894
|
llm.init();
|
7583
7895
|
|
@@ -7663,6 +7975,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
7663
7975
|
{
|
7664
7976
|
result = llm.build_gemma();
|
7665
7977
|
} break;
|
7978
|
+
case LLM_ARCH_STARCODER2:
|
7979
|
+
{
|
7980
|
+
result = llm.build_starcoder2();
|
7981
|
+
} break;
|
7666
7982
|
default:
|
7667
7983
|
GGML_ASSERT(false);
|
7668
7984
|
}
|
@@ -7672,6 +7988,20 @@ static struct ggml_cgraph * llama_build_graph(
|
|
7672
7988
|
return result;
|
7673
7989
|
}
|
7674
7990
|
|
7991
|
+
static void llama_set_k_shift(llama_context & lctx) {
|
7992
|
+
const auto & cparams = lctx.cparams;
|
7993
|
+
|
7994
|
+
const int64_t n_ctx = cparams.n_ctx;
|
7995
|
+
|
7996
|
+
assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
|
7997
|
+
|
7998
|
+
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
|
7999
|
+
|
8000
|
+
for (int i = 0; i < n_ctx; ++i) {
|
8001
|
+
data[i] = lctx.kv_self.cells[i].delta;
|
8002
|
+
}
|
8003
|
+
}
|
8004
|
+
|
7675
8005
|
static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
7676
8006
|
//
|
7677
8007
|
// set input data
|
@@ -7700,7 +8030,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
7700
8030
|
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
|
7701
8031
|
}
|
7702
8032
|
|
7703
|
-
{
|
8033
|
+
if (hparams.causal_attn) {
|
7704
8034
|
const int64_t n_kv = kv_self.n;
|
7705
8035
|
const int64_t n_tokens = batch.n_tokens;
|
7706
8036
|
|
@@ -7715,16 +8045,40 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
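The hunk below splits the KQ_mask fill into a causal branch (unchanged logic: a token may not attend to KV cells with a later position) and a new non-causal branch for embedding models, where a token attends to every token of the same sequence inside the current batch and the KV cache is not consulted at all. A standalone sketch of the non-causal branch, mirroring the loops in the diff with plain vectors instead of the inp_KQ_mask tensor (the helper name is mine):

#include <cmath>
#include <cstddef>
#include <vector>

// seq_ids[i] lists the sequence ids token i belongs to (like batch.seq_id/n_seq_id).
// Result is row-major [n_tokens][n_tokens]: 0.0f if key token i shares a sequence
// with query token j, -INFINITY otherwise.
std::vector<float> build_noncausal_mask(const std::vector<std::vector<int>> & seq_ids) {
    const size_t n_tokens = seq_ids.size();
    std::vector<float> mask(n_tokens * n_tokens, -INFINITY);
    for (size_t j = 0; j < n_tokens; ++j) {
        const int seq_id = seq_ids[j][0];   // primary sequence of the query token
        for (size_t i = 0; i < n_tokens; ++i) {
            for (int s : seq_ids[i]) {      // any shared sequence id unmasks the pair
                if (s == seq_id) {
                    mask[j*n_tokens + i] = 0.0f;
                    break;
                }
            }
        }
    }
    return mask;
}
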
|
|
7715
8045
|
|
7716
8046
|
for (int i = 0; i < n_kv; ++i) {
|
7717
8047
|
float f;
|
7718
|
-
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
|
7719
|
-
(hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
|
8048
|
+
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
|
7720
8049
|
f = -INFINITY;
|
7721
8050
|
} else {
|
7722
|
-
f = 0;
|
8051
|
+
f = 0.0f;
|
7723
8052
|
}
|
7724
8053
|
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
7725
8054
|
}
|
7726
8055
|
}
|
7727
8056
|
}
|
8057
|
+
} else {
|
8058
|
+
// non-causal attention attends only the tokens within the batch (i.e. the KV cache is not used)
|
8059
|
+
const int64_t n_tokens = batch.n_tokens;
|
8060
|
+
|
8061
|
+
assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
|
8062
|
+
|
8063
|
+
float * data = (float *) lctx.inp_KQ_mask->data;
|
8064
|
+
|
8065
|
+
for (int h = 0; h < 1; ++h) {
|
8066
|
+
for (int j = 0; j < n_tokens; ++j) {
|
8067
|
+
const llama_seq_id seq_id = batch.seq_id[j][0];
|
8068
|
+
|
8069
|
+
for (int i = 0; i < n_tokens; ++i) {
|
8070
|
+
float f = -INFINITY;
|
8071
|
+
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
|
8072
|
+
if (batch.seq_id[i][s] == seq_id) {
|
8073
|
+
f = 0.0f;
|
8074
|
+
break;
|
8075
|
+
}
|
8076
|
+
}
|
8077
|
+
|
8078
|
+
data[h*(n_tokens*n_tokens) + j*n_tokens + i] = f;
|
8079
|
+
}
|
8080
|
+
}
|
8081
|
+
}
|
7728
8082
|
}
|
7729
8083
|
|
7730
8084
|
if (hparams.need_kq_pos) {
|
@@ -7739,29 +8093,20 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
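Below, the per-batch K-shift fill is moved out of llama_set_inputs (it now lives in llama_set_k_shift) and the mean-pooling input is keyed off cparams.pooling_type. inp_mean is an n_tokens x n_tokens matrix whose row for sequence s holds 1/len(s) in the columns of that sequence's tokens, so a single matrix multiply in the graph produces the per-sequence averages. A rough sketch of that construction with plain vectors, assuming seq_id < n_tokens as the diff asserts (the helper name is mine):

#include <cstddef>
#include <vector>

// Row-major [n_tokens][n_tokens]; row s, column i is 1/count(s) if token i belongs
// to sequence s, else 0 — multiplying the embeddings by this matrix averages per sequence.
std::vector<float> build_inp_mean(const std::vector<int> & seq_ids) {
    const size_t n_tokens = seq_ids.size();
    std::vector<float> inp_mean(n_tokens * n_tokens, 0.0f);
    std::vector<int> count(n_tokens, 0);
    for (int s : seq_ids) {
        count[s]++;
    }
    for (size_t i = 0; i < n_tokens; ++i) {
        const int s = seq_ids[i];
        inp_mean[(size_t) s * n_tokens + i] = 1.0f / (float) count[s];
    }
    return inp_mean;
}
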
|
|
7739
8093
|
}
|
7740
8094
|
}
|
7741
8095
|
|
7742
|
-
if (
|
7743
|
-
const int64_t n_ctx = cparams.n_ctx;
|
7744
|
-
|
7745
|
-
assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
|
7746
|
-
|
7747
|
-
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
|
7748
|
-
|
7749
|
-
for (int i = 0; i < n_ctx; ++i) {
|
7750
|
-
data[i] = lctx.kv_self.cells[i].delta;
|
7751
|
-
}
|
7752
|
-
}
|
7753
|
-
|
7754
|
-
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
|
8096
|
+
if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
|
7755
8097
|
const int64_t n_tokens = batch.n_tokens;
|
7756
8098
|
|
7757
8099
|
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
|
7758
|
-
float * data = (float *) lctx.inp_mean->data;
|
7759
8100
|
|
8101
|
+
float * data = (float *) lctx.inp_mean->data;
|
7760
8102
|
memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
|
7761
8103
|
|
7762
8104
|
std::vector<uint64_t> sum(n_tokens, 0);
|
7763
8105
|
for (int i = 0; i < n_tokens; ++i) {
|
7764
8106
|
const llama_seq_id seq_id = batch.seq_id[i][0];
|
8107
|
+
|
8108
|
+
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
|
8109
|
+
|
7765
8110
|
sum[seq_id] += 1;
|
7766
8111
|
}
|
7767
8112
|
|
@@ -7779,15 +8124,20 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
7779
8124
|
}
|
7780
8125
|
}
|
7781
8126
|
|
7782
|
-
if (cparams.
|
8127
|
+
if (cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
|
7783
8128
|
const int64_t n_tokens = batch.n_tokens;
|
7784
8129
|
|
7785
8130
|
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
|
8131
|
+
|
7786
8132
|
uint32_t * data = (uint32_t *) lctx.inp_cls->data;
|
8133
|
+
memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
|
7787
8134
|
|
7788
8135
|
for (int i = 0; i < n_tokens; ++i) {
|
7789
8136
|
const llama_seq_id seq_id = batch.seq_id[i][0];
|
7790
|
-
const llama_pos
|
8137
|
+
const llama_pos pos = batch.pos[i];
|
8138
|
+
|
8139
|
+
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
|
8140
|
+
|
7791
8141
|
if (pos == 0) {
|
7792
8142
|
data[seq_id] = i;
|
7793
8143
|
}
|
@@ -7795,6 +8145,35 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
7795
8145
|
}
|
7796
8146
|
}
|
7797
8147
|
|
8148
|
+
static void llama_graph_compute(
|
8149
|
+
llama_context & lctx,
|
8150
|
+
ggml_cgraph * gf,
|
8151
|
+
int n_threads) {
|
8152
|
+
#ifdef GGML_USE_MPI
|
8153
|
+
const int64_t n_layer = lctx.model.hparams.n_layer;
|
8154
|
+
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
8155
|
+
#endif
|
8156
|
+
|
8157
|
+
#ifdef GGML_USE_METAL
|
8158
|
+
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
8159
|
+
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
8160
|
+
}
|
8161
|
+
#endif
|
8162
|
+
|
8163
|
+
if (lctx.backend_cpu != nullptr) {
|
8164
|
+
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
8165
|
+
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
|
8166
|
+
}
|
8167
|
+
|
8168
|
+
ggml_backend_sched_graph_compute(lctx.sched, gf);
|
8169
|
+
|
8170
|
+
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
|
8171
|
+
|
8172
|
+
#ifdef GGML_USE_MPI
|
8173
|
+
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
8174
|
+
#endif
|
8175
|
+
}
|
8176
|
+
|
7798
8177
|
// decode a batch of tokens by evaluating the transformer
|
7799
8178
|
//
|
7800
8179
|
// - lctx: llama context
|
@@ -7821,9 +8200,9 @@ static int llama_decode_internal(
|
|
7821
8200
|
const auto n_batch = cparams.n_batch;
|
7822
8201
|
|
7823
8202
|
GGML_ASSERT(n_tokens <= n_batch);
|
8203
|
+
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
|
7824
8204
|
|
7825
8205
|
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
|
7826
|
-
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
|
7827
8206
|
|
7828
8207
|
const int64_t t_start_us = ggml_time_us();
|
7829
8208
|
|
@@ -7872,21 +8251,26 @@ static int llama_decode_internal(
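In the hunk below, the causal path now calls llama_kv_cache_update (which performs any pending K-shift or defragmentation) before searching for a slot, and kv_self.n — the number of KV cells the attention will actually look at — is clamped with a pad-to-32 heuristic so early generations do not scan the whole context. A small sketch of that clamp; GGML_PAD rounds up to a multiple, and the helper names here are mine:

#include <algorithm>
#include <cstdint>

// Round x up to the next multiple of n (what the GGML_PAD macro does).
static uint32_t pad_to(uint32_t x, uint32_t n) { return ((x + n - 1) / n) * n; }

// cell_max: index of the highest used KV cell plus one.
uint32_t clamp_kv_n(uint32_t n_ctx, uint32_t cell_max) {
    return std::min<uint32_t>(n_ctx, std::max<uint32_t>(32u, pad_to(cell_max, 32)));
}

For example, clamp_kv_n(4096, 70) yields 96: 70 padded up to 96, which is below the full context of 4096.
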
|
|
7872
8251
|
batch.seq_id = seq_id_arr.data();
|
7873
8252
|
}
|
7874
8253
|
|
7875
|
-
//
|
7876
|
-
|
7877
|
-
|
7878
|
-
kv_self.head = 0;
|
7879
|
-
}
|
8254
|
+
// non-causal masks do not use the KV cache
|
8255
|
+
if (hparams.causal_attn) {
|
8256
|
+
llama_kv_cache_update(&lctx);
|
7880
8257
|
|
7881
|
-
|
7882
|
-
|
7883
|
-
|
8258
|
+
// if we have enough unused cells before the current head ->
|
8259
|
+
// better to start searching from the beginning of the cache, hoping to fill it
|
8260
|
+
if (kv_self.head > kv_self.used + 2*n_tokens) {
|
8261
|
+
kv_self.head = 0;
|
8262
|
+
}
|
7884
8263
|
|
7885
|
-
|
7886
|
-
|
7887
|
-
|
7888
|
-
|
7889
|
-
|
8264
|
+
if (!llama_kv_cache_find_slot(kv_self, batch)) {
|
8265
|
+
return 1;
|
8266
|
+
}
|
8267
|
+
|
8268
|
+
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
8269
|
+
// after enough generations, the benefit from this heuristic disappears
|
8270
|
+
// if we start defragmenting the cache, the benefit from this will be more important
|
8271
|
+
kv_self.n = std::min(cparams.n_ctx, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
|
8272
|
+
//kv_self.n = llama_kv_cache_cell_max(kv_self);
|
8273
|
+
}
|
7890
8274
|
|
7891
8275
|
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
|
7892
8276
|
|
@@ -7896,19 +8280,26 @@ static int llama_decode_internal(
|
|
7896
8280
|
ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
|
7897
8281
|
|
7898
8282
|
// the output is always the last tensor in the graph
|
7899
|
-
struct ggml_tensor * res
|
7900
|
-
struct ggml_tensor *
|
7901
|
-
|
7902
|
-
|
7903
|
-
|
7904
|
-
|
7905
|
-
|
7906
|
-
|
7907
|
-
|
7908
|
-
|
7909
|
-
res = nullptr;
|
8283
|
+
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
8284
|
+
struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
|
8285
|
+
|
8286
|
+
if (!hparams.causal_attn) {
|
8287
|
+
res = nullptr; // do not extract logits for embedding models such as BERT
|
8288
|
+
|
8289
|
+
// token or sequence embeddings
|
8290
|
+
embd = gf->nodes[gf->n_nodes - 1];
|
8291
|
+
|
8292
|
+
GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
|
7910
8293
|
} else {
|
7911
|
-
|
8294
|
+
if (strcmp(res->name, "result_output") == 0) {
|
8295
|
+
// the token embeddings could be the second to last tensor, or the third to last tensor
|
8296
|
+
if (strcmp(embd->name, "result_norm") != 0) {
|
8297
|
+
embd = gf->nodes[gf->n_nodes - 3];
|
8298
|
+
GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
|
8299
|
+
}
|
8300
|
+
} else {
|
8301
|
+
GGML_ASSERT(false && "missing result_output tensor");
|
8302
|
+
}
|
7912
8303
|
}
|
7913
8304
|
|
7914
8305
|
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
@@ -7924,40 +8315,12 @@ static int llama_decode_internal(
|
|
7924
8315
|
n_threads = std::min(4, n_threads);
|
7925
8316
|
}
|
7926
8317
|
|
7927
|
-
#ifdef GGML_USE_MPI
|
7928
|
-
const int64_t n_layer = hparams.n_layer;
|
7929
|
-
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
7930
|
-
#endif
|
7931
|
-
|
7932
|
-
#ifdef GGML_USE_METAL
|
7933
|
-
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
7934
|
-
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
7935
|
-
}
|
7936
|
-
#endif
|
7937
|
-
|
7938
|
-
if (lctx.backend_cpu != nullptr) {
|
7939
|
-
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
7940
|
-
}
|
7941
|
-
|
7942
8318
|
llama_set_inputs(lctx, batch);
|
7943
8319
|
|
7944
|
-
|
7945
|
-
|
7946
|
-
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
|
7947
|
-
|
7948
|
-
#ifdef GGML_USE_MPI
|
7949
|
-
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
7950
|
-
#endif
|
8320
|
+
llama_graph_compute(lctx, gf, n_threads);
|
7951
8321
|
|
7952
8322
|
// update the kv ring buffer
|
7953
8323
|
{
|
7954
|
-
if (kv_self.has_shift) {
|
7955
|
-
kv_self.has_shift = false;
|
7956
|
-
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
7957
|
-
kv_self.cells[i].delta = 0;
|
7958
|
-
}
|
7959
|
-
}
|
7960
|
-
|
7961
8324
|
kv_self.head += n_tokens;
|
7962
8325
|
|
7963
8326
|
// Ensure kv cache head points to a valid index.
|
@@ -7966,6 +8329,18 @@ static int llama_decode_internal(
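The hunk below adds the defrag trigger: once the attended window spans at least 128 cells, fragmentation is estimated as the fraction of that window not covered by used cells plus the new tokens, and if it exceeds cparams.defrag_thold a defrag is queued for the next llama_kv_cache_update. The same arithmetic as a tiny sketch (the function name is mine):

// Returns true when the KV cache should be queued for defragmentation.
bool should_defrag(unsigned kv_n, unsigned kv_used, unsigned n_tokens, float defrag_thold) {
    if (defrag_thold < 0.0f) {
        return false; // feature disabled
    }
    const float fragmentation = kv_n >= 128
        ? 1.0f - (float)(kv_used + n_tokens) / (float) kv_n
        : 0.0f;
    return fragmentation > defrag_thold;
}
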
|
|
7966
8329
|
}
|
7967
8330
|
}
|
7968
8331
|
|
8332
|
+
// decide if we need to defrag the kv cache
|
8333
|
+
if (cparams.defrag_thold >= 0.0f) {
|
8334
|
+
const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens)/float(kv_self.n) : 0.0f;
|
8335
|
+
|
8336
|
+
// queue defragmentation for next llama_kv_cache_update
|
8337
|
+
if (fragmentation > cparams.defrag_thold) {
|
8338
|
+
//LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
|
8339
|
+
|
8340
|
+
llama_kv_cache_defrag(kv_self);
|
8341
|
+
}
|
8342
|
+
}
|
8343
|
+
|
7969
8344
|
#ifdef GGML_PERF
|
7970
8345
|
// print timing information per ggml operation (for debugging purposes)
|
7971
8346
|
// requires GGML_PERF to be defined
|
@@ -7991,66 +8366,341 @@ static int llama_decode_internal(
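The very large hunk below does three things: logits are now fetched asynchronously from whichever backend produced the result tensor, embeddings are extracted per pooling type (token-level, CLS or MEAN), and llama_kv_cache_defrag_internal plans which KV cells move where before either a CPU copy or a ggml graph performs the moves. The planning step — fill holes near the start of the cache with occupied cells taken from the end — can be sketched standalone roughly like this; it is simplified (no graph-size limit, no contiguous-block bookkeeping) and assumes n_used equals the total number of occupied cells, as the cache maintains:

#include <cstdint>
#include <vector>

// occupied[i] says whether KV cell i currently holds data. Returns ids where
// ids[i] is the target cell for cell i; ids[i] == i or ids[i] == n_kv means "not moved".
std::vector<uint32_t> plan_defrag(const std::vector<bool> & occupied, uint32_t n_used) {
    const uint32_t n_kv = (uint32_t) occupied.size();
    std::vector<uint32_t> ids(n_kv, n_kv);

    uint32_t src = n_kv;                                  // scans backwards over donor cells
    for (uint32_t dst = 0; dst < n_used; ++dst) {
        if (occupied[dst]) {
            ids[dst] = dst;                               // not a hole, stays in place
            continue;
        }
        // found a hole: take the next occupied, not-yet-moved cell from the end
        while (src > dst) {
            --src;
            if (occupied[src] && ids[src] == n_kv) {
                ids[src] = dst;
                break;
            }
        }
        if (src <= dst) {
            break;                                        // nothing left to move
        }
    }
    return ids;
}
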
|
|
7991
8366
|
logits_out.clear();
|
7992
8367
|
#endif
|
7993
8368
|
|
7994
|
-
ggml_backend_t
|
7995
|
-
GGML_ASSERT(
|
8369
|
+
ggml_backend_t backend_res = ggml_backend_sched_get_node_backend(lctx.sched, res);
|
8370
|
+
GGML_ASSERT(backend_res != nullptr);
|
8371
|
+
|
7996
8372
|
if (batch.logits) {
|
7997
8373
|
logits_out.resize(n_vocab * n_tokens);
|
7998
8374
|
for (uint32_t i = 0; i < n_tokens; i++) {
|
7999
8375
|
if (batch.logits[i] == 0) {
|
8000
8376
|
continue;
|
8001
8377
|
}
|
8002
|
-
ggml_backend_tensor_get_async(
|
8378
|
+
ggml_backend_tensor_get_async(backend_res, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
|
8003
8379
|
#ifndef NDEBUG
|
8004
8380
|
logits_valid[i] = true;
|
8005
8381
|
#endif
|
8006
8382
|
}
|
8007
8383
|
} else if (lctx.logits_all) {
|
8008
8384
|
logits_out.resize(n_vocab * n_tokens);
|
8009
|
-
ggml_backend_tensor_get_async(
|
8385
|
+
ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
|
8010
8386
|
#ifndef NDEBUG
|
8011
8387
|
std::fill(logits_valid.begin(), logits_valid.end(), true);
|
8012
8388
|
#endif
|
8013
8389
|
} else {
|
8014
8390
|
logits_out.resize(n_vocab);
|
8015
|
-
ggml_backend_tensor_get_async(
|
8391
|
+
ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
|
8016
8392
|
#ifndef NDEBUG
|
8017
8393
|
logits_valid[0] = true;
|
8018
8394
|
#endif
|
8019
8395
|
}
|
8020
|
-
ggml_backend_synchronize(
|
8021
|
-
}
|
8396
|
+
ggml_backend_synchronize(backend_res);
|
8397
|
+
}
|
8398
|
+
|
8399
|
+
// extract embeddings
|
8400
|
+
if (cparams.embeddings && embd) {
|
8401
|
+
ggml_backend_t backend_embd = ggml_backend_sched_get_node_backend(lctx.sched, embd);
|
8402
|
+
GGML_ASSERT(backend_embd != nullptr);
|
8403
|
+
|
8404
|
+
switch (cparams.pooling_type) {
|
8405
|
+
case LLAMA_POOLING_TYPE_NONE:
|
8406
|
+
{
|
8407
|
+
// extract token embeddings
|
8408
|
+
auto & embd_out = lctx.embd;
|
8409
|
+
|
8410
|
+
if (batch.logits) {
|
8411
|
+
embd_out.resize(n_embd * n_tokens);
|
8412
|
+
for (uint32_t i = 0; i < n_tokens; i++) {
|
8413
|
+
if (batch.logits[i] == 0) {
|
8414
|
+
continue;
|
8415
|
+
}
|
8416
|
+
|
8417
|
+
ggml_backend_tensor_get_async(backend_embd, embd, embd_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
|
8418
|
+
}
|
8419
|
+
}
|
8420
|
+
} break;
|
8421
|
+
case LLAMA_POOLING_TYPE_CLS:
|
8422
|
+
case LLAMA_POOLING_TYPE_MEAN:
|
8423
|
+
{
|
8424
|
+
GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
|
8425
|
+
|
8426
|
+
// extract sequence embeddings
|
8427
|
+
auto & embd_seq_out = lctx.embd_seq;
|
8428
|
+
embd_seq_out.clear();
|
8429
|
+
|
8430
|
+
for (uint32_t i = 0; i < n_tokens; i++) {
|
8431
|
+
const llama_seq_id seq_id = batch.seq_id[i][0];
|
8432
|
+
if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
|
8433
|
+
continue;
|
8434
|
+
}
|
8435
|
+
embd_seq_out[seq_id].resize(n_embd);
|
8436
|
+
ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
|
8437
|
+
}
|
8438
|
+
} break;
|
8439
|
+
case LLAMA_POOLING_TYPE_UNSPECIFIED:
|
8440
|
+
{
|
8441
|
+
GGML_ASSERT(false && "unknown pooling type");
|
8442
|
+
} break;
|
8443
|
+
}
|
8444
|
+
ggml_backend_synchronize(backend_embd);
|
8445
|
+
}
|
8446
|
+
|
8447
|
+
// measure the performance only for the single-token evals
|
8448
|
+
if (n_tokens == 1) {
|
8449
|
+
lctx.t_eval_us += ggml_time_us() - t_start_us;
|
8450
|
+
lctx.n_eval++;
|
8451
|
+
}
|
8452
|
+
else if (n_tokens > 1) {
|
8453
|
+
lctx.t_p_eval_us += ggml_time_us() - t_start_us;
|
8454
|
+
lctx.n_p_eval += n_tokens;
|
8455
|
+
}
|
8456
|
+
|
8457
|
+
// get a more accurate load time, upon first eval
|
8458
|
+
// TODO: fix this
|
8459
|
+
if (!lctx.has_evaluated_once) {
|
8460
|
+
lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
|
8461
|
+
lctx.has_evaluated_once = true;
|
8462
|
+
}
|
8463
|
+
|
8464
|
+
return 0;
|
8465
|
+
}
|
8466
|
+
|
8467
|
+
// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
|
8468
|
+
static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
|
8469
|
+
auto & kv_self = lctx.kv_self;
|
8470
|
+
|
8471
|
+
const auto & hparams = lctx.model.hparams;
|
8472
|
+
|
8473
|
+
const uint32_t n_layer = hparams.n_layer;
|
8474
|
+
|
8475
|
+
const uint32_t n_kv = llama_kv_cache_cell_max(kv_self);
|
8476
|
+
const uint32_t n_used = kv_self.used;
|
8477
|
+
|
8478
|
+
assert(n_used <= n_kv);
|
8479
|
+
|
8480
|
+
//const int64_t t_start = ggml_time_us();
|
8481
|
+
|
8482
|
+
// number of cells moved
|
8483
|
+
uint32_t n_moves = 0;
|
8484
|
+
|
8485
|
+
// determine which KV cells to move where
|
8486
|
+
//
|
8487
|
+
// cell i moves to ids[i]
|
8488
|
+
//
|
8489
|
+
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved
|
8490
|
+
//
|
8491
|
+
std::vector<uint32_t> ids(n_kv, n_kv);
|
8492
|
+
|
8493
|
+
for (uint32_t i0 = 0; i0 < n_used; ++i0) {
|
8494
|
+
const auto & cell0 = kv_self.cells[i0];
|
8495
|
+
|
8496
|
+
if (!cell0.is_empty()) {
|
8497
|
+
ids[i0] = i0;
|
8498
|
+
|
8499
|
+
continue;
|
8500
|
+
}
|
8501
|
+
|
8502
|
+
// found a hole - fill it with data from the end of the cache
|
8503
|
+
|
8504
|
+
uint32_t nh = 1;
|
8505
|
+
|
8506
|
+
// determine the size of the hole
|
8507
|
+
while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
|
8508
|
+
nh++;
|
8509
|
+
}
|
8510
|
+
|
8511
|
+
// each move requires 6*n_layer tensors (see build_defrag)
|
8512
|
+
// - source view, destination view, copy operation
|
8513
|
+
// - x2 for keys and values
|
8514
|
+
//
|
8515
|
+
if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
|
8516
|
+
// the graph is too big, we cannot move more cells
|
8517
|
+
break;
|
8518
|
+
}
|
8519
|
+
|
8520
|
+
uint32_t nf = 0;
|
8521
|
+
uint32_t is = n_kv - 1;
|
8522
|
+
|
8523
|
+
// starting from the end, find nh non-empty cells
|
8524
|
+
for (; is > i0; --is) {
|
8525
|
+
const auto & cell1 = kv_self.cells[is];
|
8526
|
+
|
8527
|
+
if (cell1.is_empty() || ids[is] != n_kv) {
|
8528
|
+
continue;
|
8529
|
+
}
|
8530
|
+
|
8531
|
+
// non-empty cell which is not yet moved
|
8532
|
+
nf++;
|
8533
|
+
|
8534
|
+
if (nf == nh) {
|
8535
|
+
break;
|
8536
|
+
}
|
8537
|
+
}
|
8538
|
+
|
8539
|
+
// this can only happen if `n_used` is not accurate, which would be a bug
|
8540
|
+
GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
|
8541
|
+
|
8542
|
+
nf = 0;
|
8543
|
+
|
8544
|
+
uint32_t i1 = is;
|
8545
|
+
|
8546
|
+
// are we moving a continuous block of memory?
|
8547
|
+
bool cont = false;
|
8548
|
+
|
8549
|
+
// go back and move the nf cells to the hole
|
8550
|
+
for (; i1 < n_kv; ++i1) {
|
8551
|
+
auto & cell1 = kv_self.cells[i1];
|
8552
|
+
|
8553
|
+
if (cell1.is_empty() || ids[i1] != n_kv) {
|
8554
|
+
cont = false;
|
8555
|
+
continue;
|
8556
|
+
}
|
8557
|
+
|
8558
|
+
// this cell goes to (i0 + nf)
|
8559
|
+
ids[i1] = i0 + nf;
|
8560
|
+
|
8561
|
+
// move the cell meta data
|
8562
|
+
kv_self.cells[i0 + nf] = cell1;
|
8563
|
+
|
8564
|
+
// clear the old cell and move the head there
|
8565
|
+
cell1 = llama_kv_cell();
|
8566
|
+
kv_self.head = n_used;
|
8567
|
+
|
8568
|
+
if (!cont) {
|
8569
|
+
n_moves++;
|
8570
|
+
cont = true;
|
8571
|
+
}
|
8572
|
+
|
8573
|
+
nf++;
|
8574
|
+
|
8575
|
+
if (nf == nh) {
|
8576
|
+
break;
|
8577
|
+
}
|
8578
|
+
}
|
8579
|
+
|
8580
|
+
//LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
|
8581
|
+
|
8582
|
+
i0 += nh - 1;
|
8583
|
+
}
|
8584
|
+
|
8585
|
+
if (n_moves == 0) {
|
8586
|
+
return;
|
8587
|
+
}
|
8588
|
+
|
8589
|
+
//LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
|
8590
|
+
|
8591
|
+
//LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
|
8592
|
+
|
8593
|
+
#if 0
|
8594
|
+
// CPU defrag
|
8595
|
+
//
|
8596
|
+
// TODO: optimizations are possible:
|
8597
|
+
// - multiple threads
|
8598
|
+
// - avoid copying to the host memory when already there
|
8599
|
+
//
|
8600
|
+
// likely not worth the effort, as we have ggml_graph based defrag
|
8601
|
+
//
|
8602
|
+
|
8603
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
8604
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
8605
|
+
|
8606
|
+
const uint32_t kv_size = kv_self.size;
|
8607
|
+
|
8608
|
+
std::vector<uint8_t> buf_k;
|
8609
|
+
std::vector<uint8_t> buf_v;
|
8610
|
+
|
8611
|
+
for (uint32_t il = 0; il < n_layer; ++il) {
|
8612
|
+
const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
|
8613
|
+
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size);
|
8614
|
+
|
8615
|
+
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
8616
|
+
const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size);
|
8617
|
+
|
8618
|
+
buf_k.resize(k_size);
|
8619
|
+
buf_v.resize(v_size);
|
8620
|
+
|
8621
|
+
ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
|
8622
|
+
ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
|
8623
|
+
|
8624
|
+
// batch move [i, i+nm) to [id, id+nm)
|
8625
|
+
// note: cells can move only to a lower index
|
8626
|
+
for (uint32_t i = 0; i < n_kv; ++i) {
|
8627
|
+
const uint32_t id = ids[i];
|
8628
|
+
|
8629
|
+
if (i == id || id == n_kv) {
|
8630
|
+
continue;
|
8631
|
+
}
|
8632
|
+
|
8633
|
+
uint32_t nm = 1;
|
8634
|
+
|
8635
|
+
while (i + nm < n_kv && ids[i + nm] == id + nm) {
|
8636
|
+
nm++;
|
8637
|
+
}
|
8638
|
+
|
8639
|
+
// move keys
|
8640
|
+
{
|
8641
|
+
const int64_t os = i*k_size_row;
|
8642
|
+
const int64_t od = id*k_size_row;
|
8643
|
+
|
8644
|
+
memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
|
8645
|
+
}
|
8646
|
+
|
8647
|
+
// move values (note: they are transposed)
|
8648
|
+
{
|
8649
|
+
const int64_t os = i;
|
8650
|
+
const int64_t od = id;
|
8651
|
+
|
8652
|
+
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
8653
|
+
memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
|
8654
|
+
}
|
8655
|
+
}
|
8656
|
+
|
8657
|
+
i += nm - 1;
|
8658
|
+
}
|
8659
|
+
|
8660
|
+
ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
|
8661
|
+
ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
|
8662
|
+
}
|
8663
|
+
#else
|
8664
|
+
// ggml_graph defrag
|
8665
|
+
|
8666
|
+
ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
|
8667
|
+
|
8668
|
+
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
|
8669
|
+
#endif
|
8670
|
+
|
8671
|
+
//const int64_t t_end = ggml_time_us();
|
8672
|
+
|
8673
|
+
//LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
|
8674
|
+
}
|
8675
|
+
|
8676
|
+
static void llama_kv_cache_update_internal(struct llama_context & lctx) {
|
8677
|
+
// apply K-shift if needed
|
8678
|
+
if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
|
8679
|
+
llama_set_k_shift(lctx);
|
8680
|
+
|
8681
|
+
{
|
8682
|
+
ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
|
8683
|
+
|
8684
|
+
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
|
8685
|
+
}
|
8022
8686
|
|
8023
|
-
|
8024
|
-
|
8025
|
-
auto & embedding_out = lctx.embedding;
|
8687
|
+
{
|
8688
|
+
auto & kv_self = lctx.kv_self;
|
8026
8689
|
|
8027
|
-
|
8028
|
-
const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
|
8690
|
+
kv_self.has_shift = false;
|
8029
8691
|
|
8030
|
-
|
8031
|
-
|
8032
|
-
|
8033
|
-
|
8692
|
+
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
8693
|
+
kv_self.cells[i].delta = 0;
|
8694
|
+
}
|
8695
|
+
}
|
8034
8696
|
}
|
8035
8697
|
|
8036
|
-
//
|
8037
|
-
if (
|
8038
|
-
lctx
|
8039
|
-
lctx.n_eval++;
|
8040
|
-
}
|
8041
|
-
else if (n_tokens > 1) {
|
8042
|
-
lctx.t_p_eval_us += ggml_time_us() - t_start_us;
|
8043
|
-
lctx.n_p_eval += n_tokens;
|
8044
|
-
}
|
8698
|
+
// defragment the KV cache if needed
|
8699
|
+
if (lctx.kv_self.do_defrag) {
|
8700
|
+
llama_kv_cache_defrag_internal(lctx);
|
8045
8701
|
|
8046
|
-
|
8047
|
-
// TODO: fix this
|
8048
|
-
if (!lctx.has_evaluated_once) {
|
8049
|
-
lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
|
8050
|
-
lctx.has_evaluated_once = true;
|
8702
|
+
lctx.kv_self.do_defrag = false;
|
8051
8703
|
}
|
8052
|
-
|
8053
|
-
return 0;
|
8054
8704
|
}
|
8055
8705
|
|
8056
8706
|
//
|
@@ -8085,19 +8735,19 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
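The hunk below rewrites llama_token_to_byte as a switch over the vocabulary type; for SPM vocabularies a byte token is stored as text of the form "<0xAB>", so the two hex digits at offset 3 are parsed. A tiny standalone sketch of that SPM branch (the function name is mine):

#include <cstdint>
#include <cstdlib>
#include <string>

// SPM byte tokens look like "<0x41>"; the two hex digits start at offset 3.
uint8_t spm_byte_token_to_byte(const std::string & text) {
    const std::string buf = text.substr(3, 2);              // e.g. "41"
    return (uint8_t) std::strtol(buf.c_str(), nullptr, 16); // 0x41
}

For instance, spm_byte_token_to_byte("<0x0A>") returns 0x0A, the newline byte.
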
|
|
8085
8735
|
GGML_ASSERT(llama_is_byte_token(vocab, id));
|
8086
8736
|
const auto& token_data = vocab.id_to_token.at(id);
|
8087
8737
|
switch (llama_vocab_get_type(vocab)) {
|
8088
|
-
|
8089
|
-
|
8090
|
-
|
8091
|
-
|
8092
|
-
|
8093
|
-
|
8094
|
-
|
8095
|
-
|
8096
|
-
|
8097
|
-
|
8098
|
-
|
8099
|
-
|
8100
|
-
|
8738
|
+
case LLAMA_VOCAB_TYPE_SPM: {
|
8739
|
+
auto buf = token_data.text.substr(3, 2);
|
8740
|
+
return strtol(buf.c_str(), NULL, 16);
|
8741
|
+
}
|
8742
|
+
case LLAMA_VOCAB_TYPE_BPE: {
|
8743
|
+
GGML_ASSERT(false);
|
8744
|
+
return unicode_to_bytes_bpe(token_data.text);
|
8745
|
+
}
|
8746
|
+
case LLAMA_VOCAB_TYPE_WPM: {
|
8747
|
+
GGML_ASSERT(false);
|
8748
|
+
}
|
8749
|
+
default:
|
8750
|
+
GGML_ASSERT(false);
|
8101
8751
|
}
|
8102
8752
|
}
|
8103
8753
|
|
@@ -8644,37 +9294,46 @@ struct llm_tokenizer_wpm {
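The hunk below replaces the WordPiece tokenizer's ad-hoc accent stripping with a pipeline over Unicode codepoints: NFD decomposition via nfd_map, dropping accent marks and control characters, lowercasing, normalizing whitespace, and padding punctuation and CJK characters with spaces before splitting on whitespace. A much-simplified, ASCII-only sketch of the same shaping idea — the real code works on codepoints with the nfd_map and codepoint_type helpers from unicode.h, and the function name here is mine:

#include <cctype>
#include <string>
#include <vector>

// ASCII-only stand-in for llm_tokenizer_wpm::preprocess: lowercase, collapse
// whitespace, surround punctuation with spaces, then split on whitespace.
std::vector<std::string> wpm_preprocess_ascii(const std::string & text) {
    std::string shaped;
    for (unsigned char c : text) {
        if (std::iscntrl(c)) {
            continue;                               // drop control characters
        }
        if (std::isspace(c)) {
            shaped += ' ';                          // uniform whitespace
            continue;
        }
        const char lc = (char) std::tolower(c);
        if (std::ispunct(c)) {
            shaped += ' '; shaped += lc; shaped += ' ';
        } else {
            shaped += lc;
        }
    }
    // split on whitespace
    std::vector<std::string> words;
    std::string cur;
    for (char c : shaped) {
        if (c == ' ') {
            if (!cur.empty()) { words.push_back(cur); cur.clear(); }
        } else {
            cur += c;
        }
    }
    if (!cur.empty()) {
        words.push_back(cur);
    }
    return words;
}
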
|
|
8644
9294
|
}
|
8645
9295
|
|
8646
9296
|
std::vector<std::string> preprocess(const std::string & text) {
|
8647
|
-
|
8648
|
-
|
9297
|
+
// normalization form D
|
9298
|
+
std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
|
9299
|
+
std::vector<uint32_t> nfd_codepoints;
|
9300
|
+
for (uint32_t code : codepoints) {
|
9301
|
+
auto it = nfd_map.equal_range(code);
|
9302
|
+
if (it.first != it.second) {
|
9303
|
+
for (auto jt = it.first; jt != it.second; jt++) {
|
9304
|
+
nfd_codepoints.push_back(jt->second);
|
9305
|
+
}
|
9306
|
+
} else {
|
9307
|
+
nfd_codepoints.push_back(code);
|
9308
|
+
}
|
9309
|
+
}
|
8649
9310
|
|
8650
|
-
//
|
8651
|
-
//
|
8652
|
-
std::vector<std::string> words;
|
9311
|
+
// strip accents, strip control, uniformize whitespace,
|
9312
|
+
// to lowercase, pad chinese characters, pad punctuation
|
8653
9313
|
std::string new_str = "";
|
8654
|
-
|
8655
|
-
|
8656
|
-
|
8657
|
-
|
8658
|
-
|
8659
|
-
|
8660
|
-
|
8661
|
-
|
9314
|
+
for (uint32_t code : nfd_codepoints) {
|
9315
|
+
int type = codepoint_type(code);
|
9316
|
+
if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
|
9317
|
+
continue;
|
9318
|
+
}
|
9319
|
+
code = to_lower(code);
|
9320
|
+
if (type == CODEPOINT_TYPE_WHITESPACE) {
|
9321
|
+
code = ' ';
|
8662
9322
|
}
|
8663
|
-
|
9323
|
+
std::string s = codepoint_to_utf8(code);
|
9324
|
+
if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
|
8664
9325
|
new_str += " ";
|
8665
|
-
new_str +=
|
9326
|
+
new_str += s;
|
8666
9327
|
new_str += " ";
|
8667
|
-
|
8668
|
-
|
8669
|
-
else {
|
8670
|
-
new_str += ori_str[i];
|
8671
|
-
i += 1;
|
9328
|
+
} else {
|
9329
|
+
new_str += s;
|
8672
9330
|
}
|
8673
9331
|
}
|
8674
9332
|
|
8675
9333
|
// split by whitespace
|
8676
9334
|
uint64_t l = 0;
|
8677
9335
|
uint64_t r = 0;
|
9336
|
+
std::vector<std::string> words;
|
8678
9337
|
while (r < new_str.size()) {
|
8679
9338
|
// if is whitespace
|
8680
9339
|
if (isspace(new_str[r])) {
|
@@ -8692,47 +9351,21 @@ struct llm_tokenizer_wpm {
|
|
8692
9351
|
return words;
|
8693
9352
|
}
|
8694
9353
|
|
8695
|
-
|
8696
|
-
|
8697
|
-
|
8698
|
-
|
8699
|
-
|
8700
|
-
if (c >= 'A' && c <= 'Z') {
|
8701
|
-
text2[i] = c - 'A' + 'a';
|
8702
|
-
}
|
9354
|
+
uint32_t to_lower(uint32_t code) {
|
9355
|
+
static const std::locale locale("en_US.UTF-8");
|
9356
|
+
#if defined(_WIN32)
|
9357
|
+
if (code > 0xFFFF) {
|
9358
|
+
return code;
|
8703
9359
|
}
|
8704
|
-
|
9360
|
+
#endif
|
9361
|
+
return std::tolower(wchar_t(code), locale);
|
8705
9362
|
}
|
8706
9363
|
|
8707
|
-
bool
|
8708
|
-
|
8709
|
-
|
8710
|
-
|
8711
|
-
|
8712
|
-
unsigned char ch = static_cast<unsigned char>(str[i]);
|
8713
|
-
if (ch <= 0x7f) {
|
8714
|
-
codepoint = ch;
|
8715
|
-
num_bytes = 1;
|
8716
|
-
} else if ((ch >> 5) == 0x06) {
|
8717
|
-
codepoint = ch & 0x1f;
|
8718
|
-
num_bytes = 2;
|
8719
|
-
} else if ((ch >> 4) == 0x0e) {
|
8720
|
-
codepoint = ch & 0x0f;
|
8721
|
-
num_bytes = 3;
|
8722
|
-
} else if ((ch >> 3) == 0x1e) {
|
8723
|
-
codepoint = ch & 0x07;
|
8724
|
-
num_bytes = 4;
|
8725
|
-
}
|
8726
|
-
for (int j = 1; j < num_bytes; ++j) {
|
8727
|
-
if (i + j >= len) {
|
8728
|
-
return false; // incomplete UTF-8 character
|
8729
|
-
}
|
8730
|
-
unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
|
8731
|
-
if ((next_ch >> 6) != 0x02) {
|
8732
|
-
return false; // invalid trailing byte
|
8733
|
-
}
|
8734
|
-
codepoint = (codepoint << 6) | (next_ch & 0x3f);
|
8735
|
-
}
|
9364
|
+
bool is_ascii_punct(uint32_t code) {
|
9365
|
+
return code < 256 && ispunct(code);
|
9366
|
+
}
|
9367
|
+
|
9368
|
+
bool is_chinese_char(uint32_t codepoint) {
|
8736
9369
|
if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
|
8737
9370
|
(codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
|
8738
9371
|
(codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
|
@@ -8748,41 +9381,6 @@ struct llm_tokenizer_wpm {
|
|
8748
9381
|
return false;
|
8749
9382
|
}
|
8750
9383
|
|
8751
|
-
std::string strip_accents(const std::string & input_string) {
|
8752
|
-
std::string resultString;
|
8753
|
-
std::map<std::string, char> accent_map = {
|
8754
|
-
{"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
|
8755
|
-
{"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
|
8756
|
-
{"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
|
8757
|
-
{"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
|
8758
|
-
{"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
|
8759
|
-
{"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
|
8760
|
-
{"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
|
8761
|
-
{"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
|
8762
|
-
{"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
|
8763
|
-
};
|
8764
|
-
|
8765
|
-
for (size_t i = 0; i < input_string.length();) {
|
8766
|
-
int len = utf8_len(input_string[i]);
|
8767
|
-
std::string curChar = input_string.substr(i, len);
|
8768
|
-
auto iter = accent_map.find(curChar);
|
8769
|
-
if (iter != accent_map.end()) {
|
8770
|
-
resultString += iter->second;
|
8771
|
-
} else {
|
8772
|
-
resultString += curChar;
|
8773
|
-
}
|
8774
|
-
i += len;
|
8775
|
-
}
|
8776
|
-
|
8777
|
-
return resultString;
|
8778
|
-
}
|
8779
|
-
|
8780
|
-
static size_t utf8_len(char src) {
|
8781
|
-
const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
|
8782
|
-
uint8_t highbits = static_cast<uint8_t>(src) >> 4;
|
8783
|
-
return lookup[highbits];
|
8784
|
-
}
|
8785
|
-
|
8786
9384
|
const llama_vocab & vocab;
|
8787
9385
|
};
|
8788
9386
|
|
@@ -9816,10 +10414,6 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand
|
|
9816
10414
|
}
|
9817
10415
|
}
|
9818
10416
|
|
9819
|
-
void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
|
9820
|
-
llama_sample_temp(ctx, candidates_p, temp);
|
9821
|
-
}
|
9822
|
-
|
9823
10417
|
void llama_sample_repetition_penalties(
|
9824
10418
|
struct llama_context * ctx,
|
9825
10419
|
llama_token_data_array * candidates,
|
@@ -9946,38 +10540,6 @@ void llama_sample_apply_guidance(
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 
-void llama_sample_classifier_free_guidance(
-        struct llama_context * ctx,
-        llama_token_data_array * candidates,
-        struct llama_context * guidance_ctx,
-        float scale) {
-    GGML_ASSERT(ctx);
-    int64_t t_start_sample_us;
-
-    t_start_sample_us = ggml_time_us();
-    const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
-
-    GGML_ASSERT(n_vocab == candidates->size);
-    GGML_ASSERT(!candidates->sorted);
-
-    std::vector<float> logits_base(n_vocab);
-    for (size_t i = 0; i < n_vocab; ++i) {
-        logits_base[i] = candidates->data[i].logit;
-    }
-
-    float * logits_guidance = llama_get_logits(guidance_ctx);
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
-    t_start_sample_us = ggml_time_us();
-
-    for (size_t i = 0; i < n_vocab; ++i) {
-        candidates->data[i].logit = logits_base[i];
-    }
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-}
-
 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
     GGML_ASSERT(ctx);
 
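The two sampling entry points removed above were deprecated wrappers; callers now use llama_sample_temp and llama_sample_apply_guidance directly. A minimal caller-side migration sketch (illustrative only; the function name sample_with_temp_and_guidance and its arguments are not part of the library, and <vector> is assumed to be included):

static void sample_with_temp_and_guidance(
        struct llama_context * ctx,
        struct llama_context * guidance_ctx,
        llama_token_data_array * candidates,
        float temp, float scale) {
    // was: llama_sample_temperature(ctx, candidates, temp);
    llama_sample_temp(ctx, candidates, temp);

    // was: llama_sample_classifier_free_guidance(ctx, candidates, guidance_ctx, scale);
    const int32_t n_vocab = llama_n_vocab(llama_get_model(ctx));
    std::vector<float> logits_base(n_vocab);
    for (int32_t i = 0; i < n_vocab; ++i) {
        logits_base[i] = candidates->data[i].logit;
    }
    llama_sample_apply_guidance(ctx, logits_base.data(), llama_get_logits(guidance_ctx), scale);
    for (int32_t i = 0; i < n_vocab; ++i) {
        candidates->data[i].logit = logits_base[i];
    }
}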
@@ -10411,7 +10973,7 @@ struct quantize_state_internal {
     {}
 };
 
-static void
+static void llama_tensor_dequantize_internal(
     struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
 ) {
@@ -10508,31 +11070,47 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype ==
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if (new_type != GGML_TYPE_Q8_0) {
             new_type = GGML_TYPE_Q6_K;
         }
     } else if (name == "token_embd.weight") {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
+            ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
             new_type = GGML_TYPE_Q2_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+            new_type = GGML_TYPE_IQ3_S;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type =
+            new_type = GGML_TYPE_IQ3_S;
         }
-    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S
+    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
         if (name.find("attn_v.weight") != std::string::npos) {
             if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
-            else new_type = GGML_TYPE_Q2_K;
+            else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             ++qs.i_attention_wv;
         }
+        else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
+            new_type = GGML_TYPE_Q4_K;
+        }
         else if (name.find("ffn_down") != std::string::npos) {
-            if (qs.i_ffn_down < qs.n_ffn_down/8)
+            if (qs.i_ffn_down < qs.n_ffn_down/8) {
+                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            }
             ++qs.i_ffn_down;
         }
         else if (name.find("attn_output.weight") != std::string::npos) {
-            if (
+            if (qs.model.hparams.n_expert == 8) {
+                new_type = GGML_TYPE_Q5_K;
+            } else {
+                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
+            }
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
@@ -10542,13 +11120,25 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ?
+            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
             new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) {
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
@@ -10574,14 +11164,24 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
         }
-        else if (ftype ==
-            new_type =
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+            new_type = GGML_TYPE_IQ3_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            new_type = GGML_TYPE_IQ2_S;
+        }
+    } else if (name.find("attn_q.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+            new_type = GGML_TYPE_IQ3_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            new_type = GGML_TYPE_IQ2_S;
         }
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
@@ -10592,6 +11192,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
                     : GGML_TYPE_Q3_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
+                    (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
+            new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
             new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
         }
@@ -10603,8 +11207,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
                 if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) {
-
+        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
+            new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
@@ -10621,39 +11225,43 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
             if (qs.model.hparams.n_expert == 8) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype ==
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
-                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M
+                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
+                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
                     new_type = GGML_TYPE_Q5_K;
                 }
             } else {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K
-                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type =
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K;
             }
         } else {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
         }
     }
     else if (name.find("attn_qkv.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     }
     else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype ==
-            new_type =
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+            new_type = GGML_TYPE_IQ3_XXS;
        }
         ++qs.i_ffn_gate;
     }
     else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype ==
-            new_type =
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+            new_type = GGML_TYPE_IQ3_XXS;
        }
         ++qs.i_ffn_up;
     }
@@ -10671,9 +11279,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     //}
     bool convert_incompatible_tensor = false;
     if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
-        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
-        new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
+        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
+        new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
@@ -10687,13 +11295,16 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         switch (new_type) {
             case GGML_TYPE_IQ2_XXS:
             case GGML_TYPE_IQ2_XS:
+            case GGML_TYPE_IQ2_S:
             case GGML_TYPE_IQ3_XXS:
+            case GGML_TYPE_IQ3_S:
             case GGML_TYPE_IQ1_S:
             case GGML_TYPE_Q2_K:
-            case GGML_TYPE_Q3_K:
-            case
-            case
-            case
+            case GGML_TYPE_Q3_K:
+            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
+            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
+            case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
+            case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
             default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
         }
         LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
@@ -10703,6 +11314,46 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     return new_type;
 }
 
+static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, int64_t * hist_cur, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+    std::mutex mutex;
+    int counter = 0;
+    size_t new_size = 0;
+    if (nthread < 2) {
+        // single-thread
+        return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur, imatrix);
+    }
+    auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
+            nrows, n_per_row, imatrix]() {
+        std::array<int64_t, 1 << 4> local_hist = {};
+        const int nrows_per_chunk = chunk_size / n_per_row;
+        size_t local_size = 0;
+        while (true) {
+            std::unique_lock<std::mutex> lock(mutex);
+            int first_row = counter; counter += nrows_per_chunk;
+            if (first_row >= nrows) {
+                if (local_size > 0) {
+                    for (int j=0; j<int(local_hist.size()); ++j) {
+                        hist_cur[j] += local_hist[j];
+                    }
+                    new_size += local_size;
+                }
+                break;
+            }
+            lock.unlock();
+            const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
+            local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
+                    first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
+        }
+    };
+    for (int it = 0; it < nthread - 1; ++it) {
+        workers.emplace_back(compute);
+    }
+    compute();
+    for (auto & w : workers) { w.join(); }
+    workers.clear();
+    return new_size;
+}
+
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
     llama_ftype ftype = params->ftype;
@@ -10719,7 +11370,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K_S:
         case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
-        case
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS: quantized_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -10730,9 +11381,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_S: quantized_type = GGML_TYPE_IQ2_XS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_M: quantized_type = GGML_TYPE_IQ2_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
         case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS: quantized_type = GGML_TYPE_IQ4_XS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_S: quantized_type = GGML_TYPE_IQ3_S; break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_M: quantized_type = GGML_TYPE_IQ3_S; break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -10810,7 +11466,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     std::vector<std::thread> workers;
     workers.reserve(nthread);
-    std::mutex mutex;
 
     int idx = 0;
 
@@ -10862,7 +11517,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= !params->only_copy;
 
         // do not quantize expert gating tensors
-
+        // NOTE: can't use LLM_TN here because the layer number is not known
+        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
 
         // do not quantize positional embeddings and token types (BERT)
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
@@ -10906,6 +11562,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
             if ((new_type == GGML_TYPE_IQ2_XXS ||
                  new_type == GGML_TYPE_IQ2_XS ||
+                 new_type == GGML_TYPE_IQ2_S ||
                  new_type == GGML_TYPE_IQ1_S ||
                 (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
                 LLAMA_LOG_ERROR("\n\n============================================================\n");
@@ -10922,7 +11579,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                 throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
             } else {
-
+                llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
                 f32_data = (float *) f32_conv_buf.data();
             }
 
@@ -10943,41 +11600,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
             const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
-
-                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
-            } else {
-                int counter = 0;
-                new_size = 0;
-                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
-                        nrows, n_per_row, imatrix]() {
-                    std::array<int64_t, 1 << 4> local_hist = {};
-                    const int nrows_per_chunk = chunk_size / n_per_row;
-                    size_t local_size = 0;
-                    while (true) {
-                        std::unique_lock<std::mutex> lock(mutex);
-                        int first_row = counter; counter += nrows_per_chunk;
-                        if (first_row >= nrows) {
-                            if (local_size > 0) {
-                                for (int j=0; j<int(local_hist.size()); ++j) {
-                                    hist_cur[j] += local_hist[j];
-                                }
-                                new_size += local_size;
-                            }
-                            break;
-                        }
-                        lock.unlock();
-                        const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
-                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
-                                first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
-                    }
-                };
-                for (int it = 0; it < nthread_use - 1; ++it) {
-                    workers.emplace_back(compute);
-                }
-                compute();
-                for (auto & w : workers) { w.join(); }
-                workers.clear();
-            }
+            new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, hist_cur.data(), imatrix, workers, nthread_use);
 
             LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
             int64_t tot_count = 0;
@@ -11327,7 +11950,7 @@ static int llama_apply_lora_from_file_internal(
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
         /*.n_gpu_layers =*/ 0,
-        /*.split_mode =*/
+        /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
         /*.progress_callback =*/ nullptr,
@@ -11353,7 +11976,8 @@ struct llama_context_params llama_context_default_params() {
         /*.n_batch =*/ 512,
         /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
         /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
-        /*.rope_scaling_type =*/
+        /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
+        /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
         /*.rope_freq_base =*/ 0.0f,
         /*.rope_freq_scale =*/ 0.0f,
         /*.yarn_ext_factor =*/ -1.0f,
@@ -11361,15 +11985,16 @@ struct llama_context_params llama_context_default_params() {
         /*.yarn_beta_fast =*/ 32.0f,
         /*.yarn_beta_slow =*/ 1.0f,
         /*.yarn_orig_ctx =*/ 0,
+        /*.defrag_thold =*/ -1.0f,
         /*.cb_eval =*/ nullptr,
         /*.cb_eval_user_data =*/ nullptr,
         /*.type_k =*/ GGML_TYPE_F16,
         /*.type_v =*/ GGML_TYPE_F16,
-        /*.mul_mat_q =*/ true,
         /*.logits_all =*/ false,
-        /*.
+        /*.embeddings =*/ false,
         /*.offload_kqv =*/ true,
-        /*.
+        /*.abort_callback =*/ nullptr,
+        /*.abort_callback_data =*/ nullptr,
     };
 
     return result;
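For context, a sketch of how a caller might fill the new context fields introduced above (illustrative only; my_abort_cb is a placeholder, and LLAMA_POOLING_TYPE_MEAN is assumed to be present in this revision of llama.h):

static bool my_abort_cb(void * /*data*/) {
    return false; // return true to make the in-flight decode bail out
}

static struct llama_context_params make_context_params(void) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.embeddings          = true;                    // replaces the old boolean embedding switch
    cparams.pooling_type        = LLAMA_POOLING_TYPE_MEAN; // UNSPECIFIED defers to the GGUF metadata
    cparams.defrag_thold        = 0.1f;                    // negative keeps KV-cache defrag disabled
    cparams.abort_callback      = my_abort_cb;
    cparams.abort_callback_data = nullptr;
    return cparams;
}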
@@ -11421,15 +12046,6 @@ bool llama_supports_gpu_offload(void) {
 #endif
 }
 
-// deprecated:
-bool llama_mmap_supported(void) {
-    return llama_supports_mmap();
-}
-
-bool llama_mlock_supported(void) {
-    return llama_supports_mlock();
-}
-
 void llama_backend_init(void) {
     ggml_time_init();
 
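The deprecated capability probes removed above have direct replacements; a hypothetical caller-side check (sketch only, assumes <cstdio>):

static void log_io_capabilities(void) {
    // replaces the removed llama_mmap_supported()/llama_mlock_supported() wrappers
    printf("mmap:  %s\n", llama_supports_mmap()  ? "yes" : "no");
    printf("mlock: %s\n", llama_supports_mlock() ? "yes" : "no");
}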
@@ -11525,9 +12141,10 @@ struct llama_context * llama_new_context_with_model(
     cparams.yarn_attn_factor = params.yarn_attn_factor;
     cparams.yarn_beta_fast = params.yarn_beta_fast;
     cparams.yarn_beta_slow = params.yarn_beta_slow;
-    cparams.
+    cparams.defrag_thold = params.defrag_thold;
+    cparams.embeddings = params.embeddings;
     cparams.offload_kqv = params.offload_kqv;
-    cparams.
+    cparams.pooling_type = params.pooling_type;
 
     cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
     cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -11541,16 +12158,24 @@ struct llama_context * llama_new_context_with_model(
     cparams.cb_eval_user_data = params.cb_eval_user_data;
 
     auto rope_scaling_type = params.rope_scaling_type;
-    if (rope_scaling_type ==
+    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
         rope_scaling_type = hparams.rope_scaling_type_train;
     }
 
-    if (rope_scaling_type ==
+    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
         cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
     }
 
     if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
-        cparams.yarn_ext_factor = rope_scaling_type ==
+        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
+    }
+
+    if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
+        if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
+            cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
+        } else {
+            cparams.pooling_type = hparams.pooling_type;
+        }
     }
 
     if (params.seed == LLAMA_DEFAULT_SEED) {
@@ -11561,8 +12186,11 @@ struct llama_context * llama_new_context_with_model(
     LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
 
-    ctx->
-    ctx->
+    ctx->abort_callback = params.abort_callback;
+    ctx->abort_callback_data = params.abort_callback_data;
+
+    ctx->rng = std::mt19937(params.seed);
+    ctx->logits_all = params.logits_all;
 
     const ggml_type type_k = params.type_k;
     const ggml_type type_v = params.type_v;
@@ -11584,8 +12212,8 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_CUBLAS)
         if (model->n_gpu_layers > 0) {
-            // with split_mode
-            if (model->split_mode ==
+            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+            if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
                 ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
                 if (backend == nullptr) {
                     LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
@@ -11594,7 +12222,7 @@ struct llama_context * llama_new_context_with_model(
                 }
                 ctx->backends.push_back(backend);
             } else {
-                //
+                // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
                 for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
                     ggml_backend_t backend = ggml_backend_cuda_init(device);
                     if (backend == nullptr) {
@@ -11620,13 +12248,31 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_SYCL)
         if (model->n_gpu_layers > 0) {
-
-            if (
-
-
-
+            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+            if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+                int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
+                ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            } else {
+                // LLAMA_SPLIT_LAYER requires a backend for each GPU
+                int id_list[GGML_SYCL_MAX_DEVICES];
+                ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
+                for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
+                    int device_id = id_list[i];
+                    ggml_backend_t backend = ggml_backend_sycl_init(i);
+                    if (backend == nullptr) {
+                        LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
+                        llama_free(ctx);
+                        return nullptr;
+                    }
+                    ctx->backends.push_back(backend);
+                }
             }
-            ctx->backends.push_back(backend);
         }
 #elif defined(GGML_USE_KOMPUTE)
         if (model->n_gpu_layers > 0) {
@@ -11647,8 +12293,7 @@ struct llama_context * llama_new_context_with_model(
         }
         ctx->backends.push_back(ctx->backend_cpu);
 
-        if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v,
-                cparams.n_ctx, cparams.offload_kqv)) {
+        if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) {
             LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
@@ -11675,8 +12320,8 @@ struct llama_context * llama_new_context_with_model(
         // resized during inference, reserve maximum
         ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
 
-        if (params.
-            ctx->
+        if (params.embeddings) {
+            ctx->embd.reserve(hparams.n_embd*cparams.n_batch);
         }
 
         // graph inputs
@@ -11707,7 +12352,6 @@ struct llama_context * llama_new_context_with_model(
         ggml_set_name(ctx->inp_cls, "inp_cls");
 
         ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
-
         LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
                 ggml_backend_buffer_name(ctx->buf_input),
                 ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
@@ -11727,7 +12371,7 @@ struct llama_context * llama_new_context_with_model(
     }
 
     // buffer used to store the computation graph and the tensor meta data
-    ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES +
+    ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
 
     ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
 
@@ -11796,6 +12440,50 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
     return model->vocab.type;
 }
 
+enum llama_rope_type llama_rope_type(const struct llama_model * model) {
+    switch (model->arch) {
+        // these models do not use RoPE
+        case LLM_ARCH_GPT2:
+        case LLM_ARCH_GPTJ:
+        case LLM_ARCH_GPTNEOX:
+        case LLM_ARCH_MPT:
+        case LLM_ARCH_REFACT:
+        case LLM_ARCH_BLOOM:
+            return LLAMA_ROPE_TYPE_NONE;
+
+        // use what we call a normal RoPE, operating on pairs of consecutive head values
+        case LLM_ARCH_LLAMA:
+        case LLM_ARCH_BAICHUAN:
+        case LLM_ARCH_STARCODER:
+        case LLM_ARCH_PLAMO:
+        case LLM_ARCH_CODESHELL:
+        case LLM_ARCH_ORION:
+        case LLM_ARCH_INTERNLM2:
+        case LLM_ARCH_MINICPM:
+            return LLAMA_ROPE_TYPE_NORM;
+
+        // the pairs of head values are offset by n_rot/2
+        case LLM_ARCH_FALCON:
+        case LLM_ARCH_PERSIMMON:
+        case LLM_ARCH_BERT:
+        case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_STABLELM:
+        case LLM_ARCH_QWEN:
+        case LLM_ARCH_QWEN2:
+        case LLM_ARCH_PHI2:
+        case LLM_ARCH_GEMMA:
+        case LLM_ARCH_STARCODER2:
+            return LLAMA_ROPE_TYPE_NEOX;
+
+        // all model arches should be listed explicitly here
+        case LLM_ARCH_UNKNOWN:
+            GGML_ASSERT(false && "unknown architecture");
+            break;
+    }
+
+    return LLAMA_ROPE_TYPE_NONE;
+}
+
 int32_t llama_n_vocab(const struct llama_model * model) {
     return model->vocab.id_to_token.size();
 }
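A short usage sketch for the new query (illustrative; the model handle is assumed to come from llama_load_model_from_file):

static const char * describe_rope(const struct llama_model * model) {
    switch (llama_rope_type(model)) {
        case LLAMA_ROPE_TYPE_NONE: return "no RoPE";
        case LLAMA_ROPE_TYPE_NORM: return "normal RoPE (consecutive pairs)";
        case LLAMA_ROPE_TYPE_NEOX: return "NeoX-style RoPE (pairs offset by n_rot/2)";
        default:                   return "other";
    }
}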
@@ -11898,15 +12586,6 @@ uint32_t llama_model_quantize(
     }
 }
 
-int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
-    try {
-        return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
-        return 1;
-    }
-}
-
 int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
     try {
         return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
@@ -12038,12 +12717,12 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
     llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
 }
 
-void
+void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
     if (delta == 0) {
         return;
     }
 
-
+    llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta);
 }
 
 void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
@@ -12054,6 +12733,19 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla
     llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
 }
 
+llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) {
+    return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id);
+}
+
+void llama_kv_cache_defrag(struct llama_context * ctx) {
+    llama_kv_cache_defrag(ctx->kv_self);
+}
+
+void llama_kv_cache_update(struct llama_context * ctx) {
+    llama_kv_cache_update_internal(*ctx);
+}
+
+
 // Returns the *maximum* size of the state
 size_t llama_get_state_size(const struct llama_context * ctx) {
     // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
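A sketch of how a client might combine the new KV-cache entry points when discarding the oldest part of a conversation (illustrative only; ctx, seq_id and n_discard are assumed to exist in the caller):

static void drop_oldest_tokens(struct llama_context * ctx, llama_seq_id seq_id, llama_pos n_discard) {
    const llama_pos pos_max = llama_kv_cache_seq_pos_max(ctx, seq_id);

    // remove the first n_discard positions, then shift the remainder back
    llama_kv_cache_seq_rm (ctx, seq_id, 0, n_discard);
    llama_kv_cache_seq_add(ctx, seq_id, n_discard, pos_max + 1, -n_discard);

    // schedule defragmentation and apply the pending shift/defrag now
    llama_kv_cache_defrag(ctx);
    llama_kv_cache_update(ctx);
}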
@@ -12064,10 +12756,15 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     // assume worst case for logits although only currently set ones are serialized
     const size_t s_logits = ctx->logits.capacity() * sizeof(float);
     const size_t s_embedding_size = sizeof(size_t);
-    const size_t s_embedding = ctx->
-    const size_t
-    const size_t
+    const size_t s_embedding = ctx->embd.capacity() * sizeof(float);
+    const size_t s_kv_buf_size = sizeof(size_t);
+    const size_t s_kv_head = sizeof(uint32_t);
+    const size_t s_kv_size = sizeof(uint32_t);
+    const size_t s_kv_used = sizeof(uint32_t);
     const size_t s_kv = ctx->kv_self.total_size();
+    // TODO: assume the max is more than 1 seq_id per KV cell
+    const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
+    const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
 
     const size_t s_total = (
         + s_rng_size
@@ -12076,9 +12773,12 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
         + s_logits
         + s_embedding_size
         + s_embedding
+        + s_kv_buf_size
+        + s_kv_head
         + s_kv_size
-        +
+        + s_kv_used
         + s_kv
+        + s_kv_cells
     );
 
     return s_total;
@@ -12165,12 +12865,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
 
     // copy embeddings
     {
-        const size_t
+        const size_t embeddings_size = ctx->embd.size();
 
-        data_ctx->write(&
+        data_ctx->write(&embeddings_size, sizeof(embeddings_size));
 
-        if (
-            data_ctx->write(ctx->
+        if (embeddings_size) {
+            data_ctx->write(ctx->embd.data(), embeddings_size * sizeof(float));
         }
     }
 
@@ -12178,15 +12878,13 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     {
         const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
-        const auto & cparams = ctx->cparams;
 
-        const
-        const
-        const
-        const auto n_ctx = cparams.n_ctx;
+        const uint32_t n_layer = hparams.n_layer;
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
 
         const size_t kv_buf_size = kv_self.total_size();
-        const uint32_t kv_head = kv_self
+        const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
         const uint32_t kv_size = kv_self.size;
         const uint32_t kv_used = kv_self.used;
 
@@ -12198,14 +12896,16 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         if (kv_buf_size) {
             std::vector<uint8_t> tmp_buf;
             for (int il = 0; il < (int) n_layer; ++il) {
-                size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
+                const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
+
                 tmp_buf.resize(k_size);
                 ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
                 data_ctx->write(tmp_buf.data(), tmp_buf.size());
 
                 // v is not contiguous, copy row by row
-                size_t v_row_size
-                size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type,
+                const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+                const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
+
                 tmp_buf.resize(v_row_size);
                 for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
                     ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
@@ -12214,7 +12914,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
             }
         }
 
-        for (uint32_t i = 0; i <
+        for (uint32_t i = 0; i < kv_head; ++i) {
             const auto & cell = kv_self.cells[i];
 
             const llama_pos pos = cell.pos;
@@ -12238,8 +12938,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 }
 
 // Sets the state reading from the specified source address
-size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
-    uint8_t * inp = src;
+size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+    const uint8_t * inp = src;
 
     // set rng
     {
@@ -12248,7 +12948,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
 
-        std::string rng_str((char *)inp, rng_size); inp += rng_size;
+        std::string rng_str((const char *)inp, rng_size); inp += rng_size;
 
         std::istringstream rng_ss(rng_str);
         rng_ss >> ctx->rng;
@@ -12274,15 +12974,17 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
     // set embeddings
     {
-        size_t
+        size_t embeddings_size;
+
+        memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
 
-
+        GGML_ASSERT(ctx->embd.capacity() == embeddings_size);
 
-
+        if (embeddings_size) {
+            ctx->embd.resize(embeddings_size);
 
-
-
-            inp += embedding_size * sizeof(float);
+            memcpy(ctx->embd.data(), inp, embeddings_size * sizeof(float));
+            inp += embeddings_size * sizeof(float);
         }
     }
 
@@ -12290,12 +12992,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     {
         const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
-        const auto & cparams = ctx->cparams;
 
-        const
-        const
-        const
-        const int n_ctx = cparams.n_ctx;
+        const uint32_t n_layer = hparams.n_layer;
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
 
         size_t kv_buf_size;
         uint32_t kv_head;
@@ -12311,13 +13011,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         GGML_ASSERT(kv_self.total_size() == kv_buf_size);
 
         for (int il = 0; il < (int) n_layer; ++il) {
-            size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
+            const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
+
             ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
             inp += k_size;
 
             // v is not contiguous, copy row by row
-            size_t v_row_size
-            size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type,
+            const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+            const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
+
             for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
                 ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
                 inp += v_row_size;
@@ -12325,13 +13027,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
             }
         }
 
+        GGML_ASSERT(kv_self.size == kv_size);
+
         ctx->kv_self.head = kv_head;
         ctx->kv_self.size = kv_size;
         ctx->kv_self.used = kv_used;
 
         ctx->kv_self.cells.resize(kv_size);
 
-        for (uint32_t i = 0; i <
+        for (uint32_t i = 0; i < kv_head; ++i) {
             llama_pos pos;
             size_t seq_id_size;
 
@@ -12347,6 +13051,11 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
                 ctx->kv_self.cells[i].seq_id.insert(seq_id);
             }
         }
+
+        for (uint32_t i = kv_head; i < kv_size; ++i) {
+            ctx->kv_self.cells[i].pos = -1;
+            ctx->kv_self.cells[i].seq_id.clear();
+        }
     }
 
     const size_t nread = inp - src;
@@ -12439,43 +13148,16 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
     return true;
 }
 
-int llama_eval(
-        struct llama_context * ctx,
-        llama_token * tokens,
-        int32_t n_tokens,
-        int32_t n_past) {
-    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
-
-    const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
-    if (ret < 0) {
-        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
-    }
-
-    return ret;
-}
-
-int llama_eval_embd(
-        struct llama_context * ctx,
-        float * embd,
-        int32_t n_tokens,
-        int32_t n_past) {
-    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
-
-    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
-
-    const int ret = llama_decode_internal(*ctx, batch);
-    if (ret < 0) {
-        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
-    }
-
-    return ret;
-}
-
 void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
     ctx->cparams.n_threads = n_threads;
     ctx->cparams.n_threads_batch = n_threads_batch;
 }
 
+void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
+    ctx->abort_callback = abort_callback;
+    ctx->abort_callback_data = abort_callback_data;
+}
+
 struct llama_batch llama_batch_get_one(
         llama_token * tokens,
         int32_t n_tokens,
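The removed llama_eval/llama_eval_embd paths map onto llama_decode with a llama_batch. A caller-side sketch of the replacement, including the new abort hook (the names evaluate_prompt and should_abort are illustrative, and <cstdio> plus llama.h are assumed to be included):

static bool should_abort(void * /*data*/) {
    return false; // return true to interrupt the in-flight decode
}

static int evaluate_prompt(struct llama_context * ctx, llama_token * tokens, int32_t n_tokens, int32_t n_past) {
    llama_set_abort_callback(ctx, should_abort, nullptr);

    // was: llama_eval(ctx, tokens, n_tokens, n_past);
    const int ret = llama_decode(ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
    if (ret != 0) {
        fprintf(stderr, "llama_decode failed: %d\n", ret);
    }
    return ret;
}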
@@ -12552,11 +13234,20 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
 }
 
 float * llama_get_embeddings(struct llama_context * ctx) {
-    return ctx->
+    return ctx->embd.data();
 }
 
 float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
-    return ctx->
+    return ctx->embd.data() + i*ctx->model.hparams.n_embd;
+}
+
+float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
+    auto it = ctx->embd_seq.find(seq_id);
+    if (it == ctx->embd_seq.end()) {
+        return nullptr;
+    }
+
+    return it->second.data();
 }
 
 const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
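A sketch of reading pooled per-sequence embeddings with the new accessor (illustrative; assumes the context was created with embeddings enabled and a pooling type other than NONE):

static const float * sequence_embedding(struct llama_context * ctx, llama_seq_id seq_id) {
    // non-null only when pooling is enabled for this context
    const float * embd = llama_get_embeddings_seq(ctx, seq_id);
    if (embd == nullptr) {
        // fall back to the per-token embeddings of the last decode
        embd = llama_get_embeddings(ctx);
    }
    return embd;
}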
@@ -12730,7 +13421,7 @@ static int32_t llama_chat_apply_template_internal(
     std::string & dest, bool add_ass) {
     // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
     std::stringstream ss;
-    if (tmpl.find("<|im_start|>") != std::string::npos) {
+    if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) {
         // chatml template
         for (auto message : chat) {
             ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@@ -12738,7 +13429,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (tmpl.find("[INST]") != std::string::npos) {
+    } else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) {
         // llama2 template and its variants
         // [variant] support system message
         bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
@@ -12773,7 +13464,7 @@ static int32_t llama_chat_apply_template_internal(
             }
         }
         // llama2 templates seem to not care about "add_generation_prompt"
-    } else if (tmpl.find("<|user|>") != std::string::npos) {
+    } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
         // zephyr template
         for (auto message : chat) {
             ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@@ -12781,7 +13472,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
-    } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
+    } else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) {
         // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
         for (auto message : chat) {
             std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@@ -12790,7 +13481,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<s>assistant\n";
         }
-    } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
+    } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
         // google/gemma-7b-it
         std::string system_prompt = "";
         for (auto message : chat) {
@@ -12837,23 +13528,27 @@ LLAMA_API int32_t llama_chat_apply_template(
         int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
         if (res < 0) {
             // worst case: there is no information about template, we will use chatml by default
-            curr_tmpl = "
+            curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
         } else {
             curr_tmpl = std::string(model_template.data(), model_template.size());
         }
     }
+
     // format the chat to string
     std::vector<const llama_chat_message *> chat_vec;
     chat_vec.resize(n_msg);
     for (size_t i = 0; i < n_msg; i++) {
         chat_vec[i] = &chat[i];
     }
+
     std::string formatted_chat;
     int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
     if (res < 0) {
         return res;
     }
-
+    if (buf && length > 0) {
+        strncpy(buf, formatted_chat.c_str(), length);
+    }
     return res;
 }
 
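Since named templates are now accepted alongside content sniffing, a minimal caller sketch (illustrative only; the buffer size and messages are placeholders, and <vector>, <string> and <algorithm> are assumed to be included):

static std::string format_chatml(const struct llama_model * model) {
    const llama_chat_message chat[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!" },
    };
    std::vector<char> buf(4096);
    const int32_t n = llama_chat_apply_template(model, "chatml", chat, 2, /*add_ass=*/true,
                                                buf.data(), (int32_t) buf.size());
    if (n < 0) {
        return std::string();
    }
    // n is the required length; it may exceed the buffer, so clamp before copying
    return std::string(buf.data(), std::min((size_t) n, buf.size()));
}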