llama_cpp 0.12.7 → 0.14.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -0
- data/ext/llama_cpp/llama_cpp.cpp +131 -288
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +29 -29
- data/vendor/tmp/llama.cpp/Makefile +10 -6
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +32 -23
- data/vendor/tmp/llama.cpp/ggml-backend.h +17 -16
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +949 -168
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +159 -22
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1195 -139
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +27 -27
- data/vendor/tmp/llama.cpp/ggml-quants.c +1971 -271
- data/vendor/tmp/llama.cpp/ggml-quants.h +52 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3586 -1201
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1391 -825
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +545 -210
- data/vendor/tmp/llama.cpp/ggml.h +65 -23
- data/vendor/tmp/llama.cpp/llama.cpp +1458 -763
- data/vendor/tmp/llama.cpp/llama.h +81 -75
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
@@ -68,10 +68,12 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
+#include <cwctype>
 #include <forward_list>
 #include <fstream>
 #include <functional>
 #include <initializer_list>
+#include <locale>
 #include <map>
 #include <memory>
 #include <mutex>
@@ -102,6 +104,7 @@
 #define LLAMA_MAX_NODES   8192
 #define LLAMA_MAX_EXPERTS 8
 
+
 //
 // logging
 //
@@ -209,10 +212,11 @@ enum llm_arch {
     LLM_ARCH_INTERNLM2,
     LLM_ARCH_MINICPM,
     LLM_ARCH_GEMMA,
+    LLM_ARCH_STARCODER2,
     LLM_ARCH_UNKNOWN,
 };
 
-static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA,  "llama"  },
     { LLM_ARCH_FALCON, "falcon" },
     { LLM_ARCH_GPT2,   "gpt2"   },
@@ -236,6 +240,8 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_INTERNLM2,  "internlm2"  },
     { LLM_ARCH_MINICPM,    "minicpm"    },
     { LLM_ARCH_GEMMA,      "gemma"      },
+    { LLM_ARCH_STARCODER2, "starcoder2" },
+    { LLM_ARCH_UNKNOWN,    "(unknown)"  },
 };
 
 enum llm_kv {
@@ -296,7 +302,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_RWKV,
 };
 
-static std::map<llm_kv, const char *> LLM_KV_NAMES = {
+static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_ARCHITECTURE,         "general.architecture"         },
     { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
     { LLM_KV_GENERAL_ALIGNMENT,            "general.alignment"            },
@@ -360,7 +366,7 @@ struct LLM_KV {
     llm_arch arch;
 
     std::string operator()(llm_kv kv) const {
-        return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
+        return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
     }
 };
 
@@ -395,7 +401,7 @@ enum llm_tensor {
     LLM_TENSOR_LAYER_OUT_NORM,
 };
 
-static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
+static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
     {
         LLM_ARCH_LLAMA,
         {
@@ -777,6 +783,24 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_STARCODER2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,    "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,   "output_norm" },
+            { LLM_TENSOR_OUTPUT,        "output" },
+            { LLM_TENSOR_ROPE_FREQS,    "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -810,38 +834,38 @@ struct LLM_TN {
     llm_arch arch;
 
     std::string operator()(llm_tensor tensor) const {
-        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
            return "__missing__";
        }
-        return LLM_TENSOR_NAMES[arch].at(tensor);
+        return LLM_TENSOR_NAMES.at(arch).at(tensor);
    }
 
    std::string operator()(llm_tensor tensor, const std::string & suffix) const {
-        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
            return "__missing__";
        }
-        return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
+        return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix;
    }
 
    std::string operator()(llm_tensor tensor, int bid) const {
-        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
            return "__missing__";
        }
-        return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
+        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid);
    }
 
    std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
-        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
            return "__missing__";
        }
-        return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
+        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix;
    }
 
    std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
-        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
            return "__missing__";
        }
-        return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
+        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
    }
 };
 
@@ -849,20 +873,20 @@ struct LLM_TN {
 // gguf helpers
 //
 
-static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
-    { LLAMA_ROPE_SCALING_NONE,   "none"   },
-    { LLAMA_ROPE_SCALING_LINEAR, "linear" },
-    { LLAMA_ROPE_SCALING_YARN,   "yarn"   },
+static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
+    { LLAMA_ROPE_SCALING_TYPE_NONE,   "none"   },
+    { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
+    { LLAMA_ROPE_SCALING_TYPE_YARN,   "yarn"   },
 };
 
-static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
+static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
     for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
         if (kv.second == name) {
-            return kv.first;
+            return (llama_rope_scaling_type) kv.first;
         }
     }
 
-    return LLAMA_ROPE_SCALING_UNSPECIFIED;
+    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
 }
 
 static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
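The name table above also drives the reverse mapping: `llama_rope_scaling_type_from_string` scans the map by value and falls back to `LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED` when nothing matches. A minimal standalone sketch of the same pattern (the enum and map names here are stand-ins, not the llama.h identifiers):

```cpp
#include <cstdio>
#include <map>
#include <string>

// Hypothetical stand-ins for the llama.h enum values used above.
enum rope_scaling : int { SCALING_UNSPECIFIED = -1, SCALING_NONE, SCALING_LINEAR, SCALING_YARN };

static const std::map<rope_scaling, const char *> SCALING_NAMES = {
    { SCALING_NONE, "none" }, { SCALING_LINEAR, "linear" }, { SCALING_YARN, "yarn" },
};

// Reverse lookup by value, falling back to UNSPECIFIED like the vendored helper.
static rope_scaling rope_scaling_from_string(const std::string & name) {
    for (const auto & kv : SCALING_NAMES) {
        if (kv.second == name) {
            return kv.first;
        }
    }
    return SCALING_UNSPECIFIED;
}

int main() {
    std::printf("%d %d\n", rope_scaling_from_string("yarn"), rope_scaling_from_string("bogus"));
}
```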
@@ -1407,7 +1431,9 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
         buft = ggml_backend_cuda_host_buffer_type();
     }
 #elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_host_buffer_type();
+    if (host_buffer) {
+        buft = ggml_backend_sycl_host_buffer_type();
+    }
 #elif defined(GGML_USE_CPU_HBM)
     buft = ggml_backend_cpu_hbm_buffer_type();
 #elif defined(GGML_USE_VULKAN)
@@ -1461,6 +1487,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
     }
 #endif
 
+#ifdef GGML_USE_SYCL
+    if (ggml_backend_sycl_get_device_count() > 1) {
+        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+    }
+#endif
+
     if (buft == nullptr) {
         buft = llama_default_buffer_type_offload(fallback_gpu);
     }
@@ -1472,6 +1504,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 static size_t llama_get_device_count() {
 #if defined(GGML_USE_CUBLAS)
     return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
     return ggml_backend_vk_get_device_count();
 #else
@@ -1485,6 +1519,11 @@ static size_t llama_get_device_memory(int device) {
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &total, &free);
     return free;
+#elif defined(GGML_USE_SYCL)
+    size_t total;
+    size_t free;
+    ggml_backend_sycl_get_device_memory(device, &total, &free);
+    return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
@@ -1550,8 +1589,9 @@ static const size_t MiB = 1024*kiB;
 static const size_t GiB = 1024*MiB;
 
 struct llama_hparams {
-    bool     vocab_only;
-    bool     rope_finetuned;
+    bool vocab_only;
+    bool rope_finetuned;
+
     uint32_t n_vocab;
     uint32_t n_ctx_train; // context size the model was trained on
     uint32_t n_embd;
@@ -1572,7 +1612,6 @@ struct llama_hparams {
     float rope_freq_base_train;
     float rope_freq_scale_train;
     uint32_t n_yarn_orig_ctx;
-    int32_t rope_scaling_type_train;
 
     float f_clamp_kqv = 0.0f;
     float f_max_alibi_bias = 0.0f;
@@ -1580,7 +1619,9 @@ struct llama_hparams {
     bool causal_attn = true;
     bool need_kq_pos = false;
 
-    uint32_t pooling_type = LLAMA_POOLING_NONE;
+    enum llama_pooling_type      pooling_type            = LLAMA_POOLING_TYPE_NONE;
+    enum llama_rope_type         rope_type               = LLAMA_ROPE_TYPE_NONE;
+    enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
 
     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
@@ -1624,13 +1665,13 @@ struct llama_hparams {
 };
 
 struct llama_cparams {
-    uint32_t n_ctx;
+    uint32_t n_ctx;           // context size used during inference
     uint32_t n_batch;
     uint32_t n_threads;       // number of threads to use for generation
     uint32_t n_threads_batch; // number of threads to use for batch processing
 
-    float    rope_freq_base;
-    float    rope_freq_scale;
+    float rope_freq_base;
+    float rope_freq_scale;
 
     uint32_t n_yarn_orig_ctx;
     // These hyperparameters are not exposed in GGUF, because all
@@ -1639,10 +1680,12 @@ struct llama_cparams {
     float yarn_attn_factor;
     float yarn_beta_fast;
     float yarn_beta_slow;
+    float defrag_thold;
 
-    bool mul_mat_q;
+    bool embeddings;
     bool offload_kqv;
-    bool do_pooling;
+
+    enum llama_pooling_type pooling_type;
 
     ggml_backend_sched_eval_callback cb_eval;
     void * cb_eval_user_data;
@@ -1707,11 +1750,20 @@ struct llama_kv_cell {
     bool has_seq_id(const llama_seq_id & id) const {
         return seq_id.find(id) != seq_id.end();
     }
+
+    bool is_empty() const {
+        return seq_id.empty();
+    }
+
+    bool is_same_seq(const llama_kv_cell & other) const {
+        return seq_id == other.seq_id;
+    }
 };
 
 // ring-buffer of cached KV data
 struct llama_kv_cache {
     bool has_shift = false;
+    bool do_defrag = false;
 
     // Note: The value of head isn't only used to optimize searching
     // for a free KV slot. llama_decode_internal also uses it, so it
|
|
1723
1775
|
// computed before each graph build
|
1724
1776
|
uint32_t n = 0;
|
1725
1777
|
|
1778
|
+
ggml_type type_k = GGML_TYPE_F16;
|
1779
|
+
ggml_type type_v = GGML_TYPE_F16;
|
1780
|
+
|
1726
1781
|
std::vector<llama_kv_cell> cells;
|
1727
1782
|
|
1728
1783
|
std::vector<struct ggml_tensor *> k_l; // per layer
|
@@ -1919,7 +1974,7 @@ struct llama_context {
|
|
1919
1974
|
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
1920
1975
|
int32_t n_eval = 0; // number of eval calls
|
1921
1976
|
|
1922
|
-
//
|
1977
|
+
// logits output (2-dimensional array: [n_tokens][n_vocab])
|
1923
1978
|
std::vector<float> logits;
|
1924
1979
|
#ifndef NDEBUG
|
1925
1980
|
// guard against access to unset logits
|
@@ -1927,13 +1982,21 @@ struct llama_context {
 #endif
     bool logits_all = false;
 
-    // input embedding (1-dimensional array: [n_embd])
-    std::vector<float> embedding;
+    // embeddings output (2-dimensional array: [n_tokens][n_embd])
+    // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
+    std::vector<float> embd;
+
+    // sequence embeddings output (map of [n_embd] vectors)
+    // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
+    std::map<llama_seq_id, std::vector<float>> embd_seq;
 
     // memory buffers used to evaluate the model
     std::vector<uint8_t> buf_compute_meta;
     ggml_backend_sched_t sched = nullptr;
 
+    ggml_abort_callback abort_callback      = nullptr;
+    void *              abort_callback_data = nullptr;
+
     // input tensors
     ggml_backend_buffer_t buf_input = nullptr;
     ggml_context * ctx_input = nullptr;
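With this split, callers read per-token embeddings from the flat `embd` buffer when pooling is disabled, and one pooled vector per sequence from `embd_seq` otherwise. A sketch of the two layouts (sizes invented for illustration):

```cpp
#include <cstdio>
#include <map>
#include <vector>

// Sketch of the two output layouts described above.
int main() {
    const int n_tokens = 3, n_embd = 4;

    // pooling disabled: one n_embd row per token, flattened row-major
    std::vector<float> embd(n_tokens * n_embd, 0.0f);
    const float * row2 = embd.data() + 2 * n_embd; // embedding of token 2

    // pooling enabled: one vector per sequence id
    std::map<int, std::vector<float>> embd_seq;
    embd_seq[0] = std::vector<float>(n_embd, 0.5f);

    std::printf("%f %zu\n", row2[0], embd_seq[0].size());
}
```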
@@ -1958,8 +2021,8 @@ struct llama_context {
 static bool llama_kv_cache_init(
              struct llama_kv_cache & cache,
                  const llama_model & model,
-                          ggml_type   ktype,
-                          ggml_type   vtype,
+                          ggml_type   type_k,
+                          ggml_type   type_v,
                            uint32_t   n_ctx,
                                bool   offload) {
     const struct llama_hparams & hparams = model.hparams;
@@ -1974,6 +2037,9 @@ static bool llama_kv_cache_init(
     cache.size = n_ctx;
     cache.used = 0;
 
+    cache.type_k = type_k;
+    cache.type_v = type_v;
+
     cache.cells.clear();
     cache.cells.resize(n_ctx);
 
@@ -2014,8 +2080,8 @@ static bool llama_kv_cache_init(
 
     for (int i = 0; i < (int) n_layer; i++) {
         struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
-        ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx);
-        ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx);
+        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*n_ctx);
+        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*n_ctx);
         ggml_format_name(k, "cache_k_l%d", i);
         ggml_format_name(v, "cache_v_l%d", i);
         cache.k_l.push_back(k);
@@ -2097,10 +2163,12 @@ static bool llama_kv_cache_find_slot(
 }
 
 // find how many cells are currently in use
-static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
-    for (uint32_t i = cache.size - 1; i > 0; --i) {
-        if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
-            return i + 1;
+static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
+    for (uint32_t i = cache.size; i > 0; --i) {
+        const llama_kv_cell & cell = cache.cells[i - 1];
+
+        if (cell.pos >= 0 && !cell.is_empty()) {
+            return i;
         }
     }
 
@@ -2135,7 +2203,7 @@ static void llama_kv_cache_seq_rm(
             } else {
                 continue;
             }
-            if (cache.cells[i].seq_id.empty()) {
+            if (cache.cells[i].is_empty()) {
                 // keep count of the number of used cells
                 if (cache.cells[i].pos >= 0) cache.used--;
 
@@ -2186,7 +2254,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
     if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
 }
 
-static void llama_kv_cache_seq_shift(
+static void llama_kv_cache_seq_add(
         struct llama_kv_cache & cache,
                  llama_seq_id   seq_id,
                     llama_pos   p0,
@@ -2204,10 +2272,14 @@ static void llama_kv_cache_seq_shift(
             cache.cells[i].delta += delta;
 
             if (cache.cells[i].pos < 0) {
-                if (!cache.cells[i].seq_id.empty()) cache.used--;
+                if (!cache.cells[i].is_empty()) {
+                    cache.used--;
+                }
                 cache.cells[i].pos = -1;
                 cache.cells[i].seq_id.clear();
-                if (new_head == cache.size) new_head = i;
+                if (new_head == cache.size) {
+                    new_head = i;
+                }
             }
         }
     }
@@ -2239,6 +2311,22 @@ static void llama_kv_cache_seq_div(
     }
 }
 
+static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+    llama_pos result = 0;
+
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id)) {
+            result = std::max(result, cache.cells[i].pos);
+        }
+    }
+
+    return result;
+}
+
+static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
+    cache.do_defrag = true;
+}
+
 //
 // model loading and saving
 //
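`llama_kv_cache_seq_pos_max` is a linear scan for the largest position held by any cell of a given sequence; `llama_kv_cache_defrag` only raises the `do_defrag` flag, and the actual move graph is built later (see `build_defrag` further down). A standalone sketch of the scan, with the cache reduced to a plain vector:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Sketch of the max-position scan added above, over simplified cells.
struct cell { int pos; bool in_seq; };

static int seq_pos_max(const std::vector<cell> & cells) {
    int result = 0;
    for (const cell & c : cells) {
        if (c.in_seq) result = std::max(result, c.pos);
    }
    return result;
}

int main() {
    std::vector<cell> cells = { {7, true}, {42, false}, {19, true} };
    std::printf("%d\n", seq_pos_max(cells)); // 19: the pos-42 cell belongs to another sequence
}
```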
@@ -2310,7 +2398,7 @@ namespace GGUFMeta {
         }
     };
 
-    struct ArrayInfo{
+    struct ArrayInfo {
         const gguf_type gt;
 
         const size_t length;
         const void * data;
@@ -2329,7 +2417,7 @@ namespace GGUFMeta {
     };
 
     template<typename T>
-    class GKV: public GKV_Base<T> {
+    class GKV : public GKV_Base<T> {
         GKV() = delete;
 
         public:
@@ -2345,46 +2433,46 @@ namespace GGUFMeta {
 
         static const char * override_type_to_str(const llama_model_kv_override_type ty) {
             switch (ty) {
-                case LLAMA_KV_OVERRIDE_BOOL:  return "bool";
-                case LLAMA_KV_OVERRIDE_INT:   return "int";
-                case LLAMA_KV_OVERRIDE_FLOAT: return "float";
+                case LLAMA_KV_OVERRIDE_TYPE_BOOL:  return "bool";
+                case LLAMA_KV_OVERRIDE_TYPE_INT:   return "int";
+                case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
             }
             return "unknown";
         }
 
-        static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * override) {
-            if (!override) { return false; }
-            if (override->tag == expected_type) {
+        static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) {
+            if (!ovrd) { return false; }
+            if (ovrd->tag == expected_type) {
                 LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
-                    __func__, override_type_to_str(override->tag), override->key);
-                switch (override->tag) {
-                    case LLAMA_KV_OVERRIDE_BOOL:  {
-                        LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false");
+                    __func__, override_type_to_str(ovrd->tag), ovrd->key);
+                switch (ovrd->tag) {
+                    case LLAMA_KV_OVERRIDE_TYPE_BOOL:  {
+                        LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
                     } break;
-                    case LLAMA_KV_OVERRIDE_INT:   {
-                        LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value);
+                    case LLAMA_KV_OVERRIDE_TYPE_INT:   {
+                        LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
                     } break;
-                    case LLAMA_KV_OVERRIDE_FLOAT: {
-                        LLAMA_LOG_INFO("%.6f\n", override->float_value);
+                    case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
+                        LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
                     } break;
                     default:
                         // Shouldn't be possible to end up here, but just in case...
                         throw std::runtime_error(
                             format("Unsupported attempt to override %s type for metadata key %s\n",
-                                override_type_to_str(override->tag), override->key));
+                                override_type_to_str(ovrd->tag), ovrd->key));
                 }
                 return true;
             }
             LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
-                __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag));
+                __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag));
             return false;
         }
 
         template<typename OT>
         static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
-        try_override(OT & target, const struct llama_model_kv_override * override) {
-            if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
-                target = override->bool_value;
+        try_override(OT & target, const struct llama_model_kv_override * ovrd) {
+            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
+                target = ovrd->bool_value;
                 return true;
             }
             return false;
@@ -2392,9 +2480,9 @@ namespace GGUFMeta {
 
         template<typename OT>
         static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
-        try_override(OT & target, const struct llama_model_kv_override * override) {
-            if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
-                target = override->int_value;
+        try_override(OT & target, const struct llama_model_kv_override * ovrd) {
+            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
+                target = ovrd->int_value;
                 return true;
             }
             return false;
@@ -2402,9 +2490,9 @@ namespace GGUFMeta {
 
         template<typename OT>
         static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
-        try_override(T & target, const struct llama_model_kv_override * override) {
-            if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
-                target = override->float_value;
+        try_override(T & target, const struct llama_model_kv_override * ovrd) {
+            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
+                target = ovrd->float_value;
                 return true;
             }
             return false;
@@ -2412,17 +2500,17 @@ namespace GGUFMeta {
 
         template<typename OT>
         static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
-        try_override(T & target, const struct llama_model_kv_override * override) {
+        try_override(T & target, const struct llama_model_kv_override * ovrd) {
             (void)target;
-            (void)override;
-            if (!override) { return false; }
+            (void)ovrd;
+            if (!ovrd) { return false; }
             // Currently, we should never end up here so it would be a bug if we do.
             throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
-                override ? override->key : "NULL"));
+                ovrd ? ovrd->key : "NULL"));
         }
 
-        static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * override = nullptr) {
-            if (try_override<T>(target, override)) {
+        static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
+            if (try_override<T>(target, ovrd)) {
                 return true;
             }
             if (k < 0) { return false; }
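The renamed `ovrd` parameter threads through a set of `try_override` overloads that are selected at compile time on the target's type via `std::enable_if`. A compilable sketch of that dispatch shape (the `kv_override` struct and field names here are hypothetical, not the llama.h layout):

```cpp
#include <cstdio>
#include <string>
#include <type_traits>

// Hypothetical mirror of the SFINAE dispatch used by GGUFMeta::GKV::try_override.
struct kv_override { bool bool_value; long int_value; };

template<typename T>
static typename std::enable_if<std::is_same<T, bool>::value, bool>::type
try_override(T & target, const kv_override * ovrd) {
    if (!ovrd) return false;
    target = ovrd->bool_value;    // bool targets consume the bool slot
    return true;
}

template<typename T>
static typename std::enable_if<!std::is_same<T, bool>::value && std::is_integral<T>::value, bool>::type
try_override(T & target, const kv_override * ovrd) {
    if (!ovrd) return false;
    target = (T) ovrd->int_value; // integral targets consume the int slot
    return true;
}

int main() {
    kv_override o = { true, 8192 };
    bool flag = false; long n = 0;
    try_override(flag, &o); // picks the bool overload
    try_override(n, &o);    // picks the integral overload
    std::printf("%d %ld\n", (int) flag, n);
}
```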
@@ -2430,12 +2518,12 @@ namespace GGUFMeta {
             return true;
         }
 
-        static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * override = nullptr) {
-            return set(ctx, gguf_find_key(ctx, key), target, override);
+        static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
+            return set(ctx, gguf_find_key(ctx, key), target, ovrd);
         }
 
-        static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * override = nullptr) {
-            return set(ctx, key.c_str(), target, override);
+        static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
+            return set(ctx, key.c_str(), target, ovrd);
         }
     };
 }
@@ -2542,9 +2630,12 @@ struct llama_model_loader {
             case GGML_TYPE_Q6_K:    ftype = LLAMA_FTYPE_MOSTLY_Q6_K;    break;
             case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
             case GGML_TYPE_IQ2_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS;  break;
+            case GGML_TYPE_IQ2_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ2_S;   break;
             case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
             case GGML_TYPE_IQ1_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_S;   break;
             case GGML_TYPE_IQ4_NL:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;  break;
+            case GGML_TYPE_IQ4_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS;  break;
+            case GGML_TYPE_IQ3_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ3_S;   break;
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2845,6 +2936,19 @@ struct llama_model_loader {
     }
 };
 
+template<>
+bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
+    uint32_t tmp;
+    const bool found = get_key(kid, tmp, required);
+    if (found) {
+        result = (enum llama_pooling_type) tmp;
+    } else {
+        result = LLAMA_POOLING_TYPE_UNSPECIFIED;
+    }
+    return found;
+}
+
+
 //
 // load LLaMA models
 //
@@ -2886,10 +2990,15 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q6_K:   return "Q6_K";
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
+        case LLAMA_FTYPE_MOSTLY_IQ2_S:  return "IQ2_S - 2.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_M:  return "IQ2_M - 2.7 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ1_S  :return "IQ1_S - 1.5625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_S:  return "IQ3_S - 3.4375 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:  return "IQ3_S mix - 3.66 bpw";
 
         default: return "unknown, may not work";
     }
@@ -2923,16 +3032,16 @@ static const char * llama_model_type_name(e_model type) {
         default: return "?B";
     }
 }
+
 static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
     switch (type) {
-        case LLAMA_VOCAB_TYPE_SPM: return "SPM";
-        case LLAMA_VOCAB_TYPE_BPE: return "BPE";
-        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
-        default: return "unknown";
+        case LLAMA_VOCAB_TYPE_SPM: return "SPM";
+        case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
+        default:                   return "unknown";
     }
 }
 
-
 static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
     model.arch = ml.get_arch();
     if (model.arch == LLM_ARCH_UNKNOWN) {
@@ -2996,7 +3105,7 @@ static void llm_load_hparams(
     std::string rope_scaling("linear");
     ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
     hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
-    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
+    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
 
     // rope_freq_scale (inverse of the kv) is optional
     float ropescale = 0.0f;
@@ -3109,10 +3218,10 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_BERT:
             {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
                 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type, false);
 
                 switch (hparams.n_layer) {
                     case 3:
@@ -3130,10 +3239,10 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_NOMIC_BERT:
             {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
                 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+                ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type);
 
                 if (hparams.n_layer == 12 && hparams.n_embd == 768) {
                     model.type = e_model::MODEL_137M;
@@ -3264,6 +3373,16 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_STARCODER2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 30: model.type = e_model::MODEL_3B; break;
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_15B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
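As elsewhere in `llm_load_hparams`, the StarCoder2 entry infers the model size purely from the layer count. The mapping reduces to a small pure function (enum values here are hypothetical stand-ins):

```cpp
#include <cstdio>

// Sketch of the n_layer -> model-size mapping above.
enum model_size { SIZE_3B, SIZE_7B, SIZE_15B, SIZE_UNKNOWN };

static model_size starcoder2_size(int n_layer) {
    switch (n_layer) {
        case 30: return SIZE_3B;
        case 32: return SIZE_7B;
        case 40: return SIZE_15B;
        default: return SIZE_UNKNOWN;
    }
}

int main() { std::printf("%d\n", starcoder2_size(40)); } // prints 2 (SIZE_15B)
```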
@@ -3272,6 +3391,8 @@ static void llm_load_hparams(
     if (hparams.f_max_alibi_bias > 0.0f) {
         hparams.need_kq_pos = true;
     }
+
+    hparams.rope_type = llama_rope_type(&model);
 }
 
 // TODO: This should probably be in llama.h
@@ -3574,6 +3695,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_ff             = %u\n",     __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: n_expert         = %u\n",     __func__, hparams.n_expert);
     LLAMA_LOG_INFO("%s: n_expert_used    = %u\n",     __func__, hparams.n_expert_used);
+    LLAMA_LOG_INFO("%s: pooling type     = %d\n",     __func__, hparams.pooling_type);
+    LLAMA_LOG_INFO("%s: rope type        = %d\n",     __func__, hparams.rope_type);
     LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type);
     LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
@@ -3640,7 +3763,7 @@ static bool llm_load_tensors(
         model.buft_layer[i] = llama_default_buffer_type_cpu(true);
     }
 
-    if (split_mode == LLAMA_SPLIT_LAYER) {
+    if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
         // calculate the split points
         int device_count = llama_get_device_count();
         bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
@@ -3679,10 +3802,10 @@ static bool llm_load_tensors(
         }
     } else {
         ggml_backend_buffer_type_t split_buft;
-        if (split_mode == LLAMA_SPLIT_ROW) {
+        if (split_mode == LLAMA_SPLIT_MODE_ROW) {
             split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
         } else {
-            // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported
+            // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
             split_buft = llama_default_buffer_type_offload(main_gpu);
         }
         // assign the repeating layers
@@ -4430,6 +4553,56 @@ static bool llm_load_tensors(
                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
                     }
                 } break;
+            case LLM_ARCH_STARCODER2:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    {
+                        model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
+
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                        // if output is NULL, init from the input tok embed
+                        if (model.output == NULL) {
+                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                            ml.n_created--; // artificial tensor
+                            ml.size_data += ggml_nbytes(model.output);
+                        }
+
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                        // optional bias tensors
+                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd});
+                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa});
+                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa});
+                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
+
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+
+                        // optional bias tensors
+                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP ,  "bias", i), {  n_ff});
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -4595,12 +4768,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
 
 using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
 
-enum llm_rope_type {
-    LLM_ROPE,
-    LLM_ROPE_NEOX,
-    LLM_ROPE_GLM,
-};
-
 enum llm_ffn_op_type {
     LLM_FFN_SILU,
     LLM_FFN_GELU,
@@ -4646,55 +4813,6 @@ static struct ggml_tensor * llm_build_inp_embd(
     return inpL;
 }
 
-// Persimmon: n_rot = n_embd_head_k/2
-// Other:     n_rot = n_embd_head_k
-static void llm_build_k_shift(
-      struct ggml_context * ctx,
-      const llama_hparams & hparams,
-      const llama_cparams & cparams,
-     const llama_kv_cache & kv,
-       struct ggml_cgraph * graph,
-       struct ggml_tensor * K_shift,
-            llm_rope_type   type,
-                  int64_t   n_ctx,
-                    float   freq_base,
-                    float   freq_scale,
-       const llm_build_cb & cb) {
-    const int64_t n_layer       = hparams.n_layer;
-    const int64_t n_head_kv     = hparams.n_head_kv;
-    const int64_t n_embd_head_k = hparams.n_embd_head_k;
-    const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
-    const int32_t n_rot         = hparams.n_rot;
-    const int32_t n_orig_ctx    = cparams.n_yarn_orig_ctx;
-    const float   ext_factor    = cparams.yarn_ext_factor;
-    const float   attn_factor   = cparams.yarn_attn_factor;
-    const float   beta_fast     = cparams.yarn_beta_fast;
-    const float   beta_slow     = cparams.yarn_beta_slow;
-
-    int rope_type = 0;
-
-    switch (type) {
-        case LLM_ROPE:      rope_type = 0; break;
-        case LLM_ROPE_NEOX: rope_type = 2; break;
-        case LLM_ROPE_GLM:  rope_type = 4; break;
-    }
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * tmp =
-            // we rotate only the first n_rot dimensions
-            ggml_rope_custom_inplace(ctx,
-                    ggml_view_3d(ctx, kv.k_l[il],
-                        n_embd_head_k, n_head_kv, n_ctx,
-                        ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
-                        ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
-                        0),
-                    K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow);
-        cb(tmp, "K_shifted", il);
-        ggml_build_forward_expand(graph, tmp);
-    }
-}
-
 static void llm_build_kv_store(
         struct ggml_context * ctx,
         const llama_hparams & hparams,
@@ -4896,8 +5014,8 @@ static struct ggml_tensor * llm_build_kqv(
         ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
     }
 
-#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_SYCL)
-#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, Kompute, and SYCL")
+#if defined(GGML_USE_KOMPUTE)
+#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
 #pragma message("      Falling back to ggml_alibi(). Will become an error in Mar 2024")
 #pragma message("ref:  https://github.com/ggerganov/llama.cpp/pull/5488")
     if (hparams.f_max_alibi_bias > 0.0f) {
@@ -4981,6 +5099,7 @@ static struct ggml_tensor * llm_build_kv(
     llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
 
     struct ggml_tensor * cur;
+
     cur  = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
             q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
     cb(cur, "kqv_out", il);
@@ -4998,6 +5117,7 @@ struct llm_build_context {
 
     const int64_t n_embd;
     const int64_t n_layer;
+    const int64_t n_rot;
     const int64_t n_ctx;       // user-specified context size (can be different from n_ctx_train)
     const int64_t n_head;
     const int64_t n_head_kv;
@@ -5022,8 +5142,8 @@ struct llm_build_context {
     const int32_t kv_head;     // index of where we store new KV data in the cache
     const int32_t n_orig_ctx;
 
-    const bool do_rope_shift;
-    const uint32_t pooling_type;
+    const enum llama_pooling_type pooling_type;
+    const enum llama_rope_type    rope_type;
 
     const llm_build_cb & cb;
 
@@ -5045,6 +5165,7 @@ struct llm_build_context {
         kv_self          (lctx.kv_self),
         n_embd           (hparams.n_embd),
         n_layer          (hparams.n_layer),
+        n_rot            (hparams.n_rot),
         n_ctx            (cparams.n_ctx),
         n_head           (hparams.n_head),
         n_head_kv        (hparams.n_head_kv),
@@ -5066,8 +5187,8 @@ struct llm_build_context {
         n_kv             (worst_case ? n_ctx            : kv_self.n),
         kv_head          (worst_case ? n_ctx - n_tokens : kv_self.head),
         n_orig_ctx       (cparams.n_yarn_orig_ctx),
-        do_rope_shift    (worst_case || kv_self.has_shift),
-        pooling_type     (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
+        pooling_type     (cparams.pooling_type),
+        rope_type        (hparams.rope_type),
         cb               (cb),
         buf_compute_meta (lctx.buf_compute_meta) {
             // all initializations should be done in init()
@@ -5090,6 +5211,76 @@ struct llm_build_context {
         }
     }
 
+    struct ggml_cgraph * build_k_shift() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * tmp =
+                // we rotate only the first n_rot dimensions
+                ggml_rope_custom_inplace(ctx0,
+                        ggml_view_3d(ctx0, kv_self.k_l[il],
+                            n_embd_head_k, n_head_kv, n_ctx,
+                            ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
+                            ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                            0),
+                        lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            cb(tmp, "K_shifted", il);
+            ggml_build_forward_expand(gf, tmp);
+        }
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        for (uint32_t i = 0; i < ids.size(); ++i) {
+            const uint32_t id = ids[i];
+
+            if (i == id || id == ids.size()) {
+                continue;
+            }
+
+            uint32_t nm = 1;
+
+            while (i + nm < ids.size() && ids[i + nm] == id + nm) {
+                nm++;
+            }
+
+            for (int il = 0; il < n_layer; ++il) {
+                ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
+                        n_embd_k_gqa, nm,
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+
+                ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
+                        n_embd_k_gqa, nm,
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+
+                ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+                        nm, n_embd_v_gqa,
+                        ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+                        ggml_row_size(kv_self.v_l[il]->type, i));
+
+                ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+                        nm, n_embd_v_gqa,
+                        ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+                        ggml_row_size(kv_self.v_l[il]->type, id));
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
+            }
+
+            i += nm - 1;
+        }
+
+        //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
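In `build_defrag`, `ids[i]` names the destination cell for source cell `i`, and the inner `while` extends a move to `nm` consecutive cells so each run costs a single pair of `ggml_cpy` nodes. A sketch of just that run-detection loop over a made-up `ids` vector:

```cpp
#include <cstdio>
#include <vector>

// Sketch of the run detection in build_defrag: ids[i] is the destination cell of
// source cell i; consecutive sources mapping to consecutive destinations are
// reported (and, in the real code, copied) as one block.
int main() {
    std::vector<unsigned> ids = { 0, 1, 5, 6, 7, 9 }; // in the real code, ids.size() marks "unmoved"
    for (unsigned i = 0; i < ids.size(); ++i) {
        const unsigned id = ids[i];
        if (i == id || id == ids.size()) continue; // already in place or not moved
        unsigned nm = 1;
        while (i + nm < ids.size() && ids[i + nm] == id + nm) nm++;
        std::printf("move %u cells: src [%u..%u] -> dst [%u..%u]\n", nm, i, i + nm - 1, id, id + nm - 1);
        i += nm - 1;
    }
}
```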
@@ -5111,11 +5302,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
@@ -5151,14 +5337,14 @@ struct llm_build_context {
 
                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-                    hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-                    hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -5299,11 +5485,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
         cb(KQ_pos, "KQ_pos", -1);
 
-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
@@ -5327,12 +5508,12 @@ struct llm_build_context {
                     case MODEL_7B:
                         Qcur = ggml_rope_custom(
                             ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-                            hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+                            n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                             ext_factor, attn_factor, beta_fast, beta_slow
                         );
                         Kcur = ggml_rope_custom(
                             ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-                            hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+                            n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                             ext_factor, attn_factor, beta_fast, beta_slow
                         );
                         break;
@@ -5417,11 +5598,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * attn_norm;
 
@@ -5460,13 +5636,13 @@ struct llm_build_context {
 
                 // using mode = 2 for neox mode
                 Qcur = ggml_rope_custom(
-                    ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
+                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_custom(
-                    ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
+                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -5636,10 +5812,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);
 
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * residual = inpL;
 
@@ -5697,7 +5869,7 @@ struct llm_build_context {
 
                 // RoPE the first n_rot of q/k, pass the other half, and concat.
                 struct ggml_tensor * qrot = ggml_view_3d(
-                    ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
+                    ctx0, tmpq, n_rot, n_head, n_tokens,
                     ggml_element_size(tmpq) * n_embd_head,
                     ggml_element_size(tmpq) * n_embd_head * n_head,
                     0
@@ -5705,7 +5877,7 @@ struct llm_build_context {
                 cb(qrot, "qrot", il);
 
                 struct ggml_tensor * krot = ggml_view_3d(
-                    ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
+                    ctx0, tmpk, n_rot, n_head, n_tokens,
                     ggml_element_size(tmpk) * n_embd_head,
                     ggml_element_size(tmpk) * n_embd_head * n_head,
                     0
@@ -5714,29 +5886,29 @@ struct llm_build_context {
 
                 // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
                 struct ggml_tensor * qpass = ggml_view_3d(
-                    ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
+                    ctx0, tmpq, n_rot, n_head, n_tokens,
                     ggml_element_size(tmpq) * n_embd_head,
                     ggml_element_size(tmpq) * n_embd_head * n_head,
-                    ggml_element_size(tmpq) * hparams.n_rot
+                    ggml_element_size(tmpq) * n_rot
                 );
                 cb(qpass, "qpass", il);
 
                 struct ggml_tensor * kpass = ggml_view_3d(
-                    ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
+                    ctx0, tmpk, n_rot, n_head, n_tokens,
                     ggml_element_size(tmpk) * n_embd_head,
                     ggml_element_size(tmpk) * n_embd_head * n_head,
-                    ggml_element_size(tmpk) * hparams.n_rot
+                    ggml_element_size(tmpk) * n_rot
                 );
                 cb(kpass, "kpass", il);
 
                 struct ggml_tensor * qrotated = ggml_rope_custom(
-                    ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
+                    ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(qrotated, "qrotated", il);
 
                 struct ggml_tensor * krotated = ggml_rope_custom(
-                    ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
+                    ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(krotated, "krotated", il);
@@ -5921,6 +6093,7 @@ struct llm_build_context {
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
@@ -5928,9 +6101,10 @@ struct llm_build_context {
 
         // get input vectors with right size
        const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+
+        struct ggml_tensor * inp_pos  = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
         struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
-        struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
+        struct ggml_tensor * inp_cls  = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
 
         // construct input embeddings (token, type, position)
         inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
@@ -5948,39 +6122,38 @@ struct llm_build_context {
         cb(inpL, "inp_norm", -1);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
+        struct ggml_tensor * KQ_mask = ggml_cont(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_tokens, n_tokens, n_tokens*ggml_type_size(lctx.inp_KQ_mask->type), 0));
+        cb(KQ_mask, "KQ_mask", -1); // [n_tokens, n_tokens]
 
         // iterate layers
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * cur = inpL;
 
+            struct ggml_tensor * Qcur;
+            struct ggml_tensor * Kcur;
+            struct ggml_tensor * Vcur;
+
             // self-attention
             if (model.arch == LLM_ARCH_BERT) {
-                struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
+                Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
                 cb(Qcur, "Qcur", il);
 
-                struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
+                Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
                 cb(Kcur, "Kcur", il);
 
-                struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
+                Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
                 cb(Vcur, "Vcur", il);
 
-                // seems like we just need to do this for Q?
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
             } else {
                 // compute Q and K and RoPE them
                 cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);
 
-                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+                Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
@@ -5988,24 +6161,52 @@ struct llm_build_context {

             Qcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-
+                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);

             Kcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-
+                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Kcur, "Kcur", il);
+            }

-
-
-
-
+            struct ggml_tensor * q =                 ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+
+            struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+            cb(kq, "kq", il);
+
+            kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
+            cb(kq, "kq_soft_max_ext", il);
+
+            struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
+            cb(v, "v", il);
+
+            struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
+            cb(kqv, "kqv", il);
+
+            struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+            cb(kqv_merged, "kqv_merged", il);
+
+            cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+            cb(cur, "kqv_merged_cont", il);
+
+            ggml_build_forward_expand(gf, cur);
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+            if (model.layers[il].bo) {
+                cb(cur, "kqv_wo", il);
             }

+            if (model.layers[il].bo) {
+                cur = ggml_add(ctx0, cur, model.layers[il].bo);
+            }
+            cb(cur, "kqv_out", il);
+
             // re-add the layer input
             cur = ggml_add(ctx0, cur, inpL);

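The inlined BERT attention above replaces the llm_build_kv helper with explicit graph ops: kq = K·Q is scaled and masked inside ggml_soft_max_ext, then multiplied into the transposed V. As a reading aid, here is a minimal standalone sketch (illustrative names only, not part of llama.cpp) of the same computation for a single head on plain arrays, assuming the mask holds 0 or -INFINITY entries as built above:

    #include <cmath>
    #include <vector>

    // naive masked scaled-dot-product attention for one head;
    // the softmax omits the usual max-subtraction, fine for a sketch
    static void naive_masked_attention(
            const std::vector<float> & Q,    // [n_tokens][d]
            const std::vector<float> & K,    // [n_tokens][d]
            const std::vector<float> & V,    // [n_tokens][d]
            const std::vector<float> & mask, // [n_tokens][n_tokens], 0 or -INFINITY
            std::vector<float> & out, int n_tokens, int d) {
        const float scale = 1.0f/std::sqrt((float) d);
        for (int j = 0; j < n_tokens; ++j) {
            std::vector<float> w(n_tokens);
            float wsum = 0.0f;
            for (int i = 0; i < n_tokens; ++i) {
                float s = 0.0f;
                for (int c = 0; c < d; ++c) s += Q[j*d + c]*K[i*d + c];
                w[i] = std::exp(s*scale + mask[j*n_tokens + i]); // exp(-inf) == 0
                wsum += w[i];
            }
            for (int c = 0; c < d; ++c) {
                float acc = 0.0f;
                for (int i = 0; i < n_tokens; ++i) acc += (w[i]/wsum)*V[i*d + c];
                out[j*d + c] = acc;
            }
        }
    }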
@@ -6045,16 +6246,29 @@ struct llm_build_context {

         // final output
         cur = inpL;
+        cb(cur, "result_embd", -1);

         // pooling layer
-
-
-
-
-
-
+        switch (pooling_type) {
+            case LLAMA_POOLING_TYPE_NONE:
+                {
+                    // nop
+                } break;
+            case LLAMA_POOLING_TYPE_MEAN:
+                {
+                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
+                    cb(cur, "result_embd_pooled", -1);
+                } break;
+            case LLAMA_POOLING_TYPE_CLS:
+                {
+                    cur = ggml_get_rows(ctx0, cur, inp_cls);
+                    cb(cur, "result_embd_pooled", -1);
+                } break;
+            case LLAMA_POOLING_TYPE_UNSPECIFIED:
+                {
+                    GGML_ASSERT(false && "Invalid pooling type");
+                } break;
         }
-        cb(cur, "result_embd", -1);

         ggml_build_forward_expand(gf, cur);

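The new pooling switch reduces per-token embeddings to per-sequence embeddings: MEAN multiplies the transposed embedding matrix by inp_mean, CLS gathers rows via inp_cls. A minimal sketch of what the MEAN branch computes, assuming inp_mean rows hold 1/len weights for each sequence's tokens (hypothetical helper on plain arrays, not ggml tensors):

    #include <vector>

    // mean pooling as a matrix product with a row-normalized indicator matrix
    static std::vector<float> mean_pool(
            const std::vector<float> & embd,     // [n_tokens][n_embd], row-major
            const std::vector<float> & inp_mean, // [n_tokens][n_tokens]
            int n_tokens, int n_embd) {
        std::vector<float> out(n_tokens*n_embd, 0.0f);
        for (int s = 0; s < n_tokens; ++s) {     // output row (sequence slot)
            for (int t = 0; t < n_tokens; ++t) { // input token
                const float w = inp_mean[s*n_tokens + t];
                for (int c = 0; c < n_embd; ++c) {
                    out[s*n_embd + c] += w*embd[t*n_embd + c];
                }
            }
        }
        return out;
    }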
@@ -6284,11 +6498,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -6325,14 +6534,14 @@ struct llm_build_context {

                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -6407,11 +6616,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -6441,13 +6645,13 @@ struct llm_build_context {

                 // using mode = 2 for neox mode
                 Qcur = ggml_rope_custom(
-                    ctx0, Qcur, inp_pos,
+                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_custom(
-                    ctx0, Kcur, inp_pos,
+                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -6521,11 +6725,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -6561,14 +6760,14 @@ struct llm_build_context {

                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -6642,11 +6841,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
                     model.layers[il].attn_norm,
@@ -6684,7 +6878,7 @@ struct llm_build_context {
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

                 Qcur = ggml_rope_custom(
-                    ctx0, Qcur, inp_pos,
+                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
@@ -6695,7 +6889,7 @@ struct llm_build_context {
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_custom(
-                    ctx0, Kcur, inp_pos,
+                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -6764,11 +6958,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {

             // norm
@@ -6792,14 +6981,14 @@ struct llm_build_context {
                 cb(Vcur, "Vcur", il);

                 Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur,
-                    n_embd_head,
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
+                    n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur,
-                    n_embd_head,
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
+                    n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Kcur, "Kcur", il);

@@ -6969,11 +7158,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             cur = llm_build_norm(ctx0, inpL, hparams,
                     model.layers[il].attn_norm,
@@ -6999,14 +7183,14 @@ struct llm_build_context {

             struct ggml_tensor * Qcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
-
+                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);

             struct ggml_tensor * Kcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
-
+                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Kcur, "Kcur", il);
@@ -7077,11 +7261,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -7117,14 +7296,14 @@ struct llm_build_context {

                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -7196,11 +7375,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -7236,14 +7410,14 @@ struct llm_build_context {

                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -7328,11 +7502,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -7368,14 +7537,14 @@ struct llm_build_context {

                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -7464,11 +7633,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {

             // norm
@@ -7491,7 +7655,7 @@ struct llm_build_context {

                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
-                    n_embd_head_k,
+                    n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Qcur, "Qcur", il);

@@ -7500,7 +7664,7 @@ struct llm_build_context {

                 Kcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
-                    n_embd_head_k,
+                    n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Kcur, "Kcur", il);

@@ -7551,33 +7715,181 @@ struct llm_build_context {

         return gf;
     }
-};
-
-static struct ggml_cgraph * llama_build_graph(
-         llama_context & lctx,
-     const llama_batch & batch,
-                  bool   worst_case) {
-    const auto & model = lctx.model;

-
-
-        if (il >= 0) {
-            ggml_format_name(cur, "%s-%d", name, il);
-        } else {
-            ggml_set_name(cur, name);
-        }
+    struct ggml_cgraph * build_starcoder2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

-
-
-
-            ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
-        }
-    }
-};
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);

-
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;

-
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
+                    NULL,                      NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                    NULL,
+                    LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+            cb(cur, "ffn_out", il);
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+};
+
+static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
+    llama_batch dummy;
+    dummy.n_tokens = 0;
+
+    llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
+
+    struct llm_build_context llm(lctx, dummy, cb, false);
+
+    llm.init();
+
+    struct ggml_cgraph * result = llm.build_defrag(ids);
+
+    llm.free();
+
+    return result;
+}
+
+static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
+    llama_batch dummy;
+    dummy.n_tokens = 0;
+
+    llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
+
+    struct llm_build_context llm(lctx, dummy, cb, false);
+
+    llm.init();
+
+    struct ggml_cgraph * result = llm.build_k_shift();
+
+    llm.free();
+
+    return result;
+}
+
+static struct ggml_cgraph * llama_build_graph(
+         llama_context & lctx,
+     const llama_batch & batch,
+                  bool   worst_case) {
+    const auto & model = lctx.model;
+
+    // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
+    llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
+        if (il >= 0) {
+            ggml_format_name(cur, "%s-%d", name, il);
+        } else {
+            ggml_set_name(cur, name);
+        }
+
+        if (!lctx.cparams.offload_kqv) {
+            if (strcmp(name, "kqv_merged_cont") == 0) {
+                // all nodes between the KV store and the attention output are run on the CPU
+                ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
+            }
+        }
+    };
+
+    struct ggml_cgraph * result = NULL;
+
+    struct llm_build_context llm(lctx, batch, cb, worst_case);

     llm.init();

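The build callback above names every node so later stages can find tensors such as "result_output" or "kqv_merged_cont" by string. A minimal sketch of the naming convention it implements (std::snprintf standing in for ggml_format_name; illustrative helper, not part of the library):

    #include <cstdio>
    #include <string>

    // graph-level tensors (il < 0) keep the bare name, e.g. "result_norm";
    // per-layer tensors become "<name>-<layer>", e.g. "kqv_out-12"
    static std::string node_name(const char * name, int il) {
        if (il < 0) {
            return name;
        }
        char buf[64];
        std::snprintf(buf, sizeof(buf), "%s-%d", name, il);
        return buf;
    }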
@@ -7663,6 +7975,10 @@ static struct ggml_cgraph * llama_build_graph(
                 {
                     result = llm.build_gemma();
                 } break;
+            case LLM_ARCH_STARCODER2:
+                {
+                    result = llm.build_starcoder2();
+                } break;
             default:
                 GGML_ASSERT(false);
         }
@@ -7672,6 +7988,20 @@ static struct ggml_cgraph * llama_build_graph(
     return result;
 }

+static void llama_set_k_shift(llama_context & lctx) {
+    const auto & cparams = lctx.cparams;
+
+    const int64_t n_ctx = cparams.n_ctx;
+
+    assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
+
+    int32_t * data = (int32_t *) lctx.inp_K_shift->data;
+
+    for (int i = 0; i < n_ctx; ++i) {
+        data[i] = lctx.kv_self.cells[i].delta;
+    }
+}
+
 static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
     //
     // set input data
@@ -7700,7 +8030,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
     }

-    {
+    if (hparams.causal_attn) {
         const int64_t n_kv     = kv_self.n;
         const int64_t n_tokens = batch.n_tokens;

@@ -7715,16 +8045,40 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {

                 for (int i = 0; i < n_kv; ++i) {
                     float f;
-                    if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
-                        (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
+                    if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
                         f = -INFINITY;
                     } else {
-                        f = 0;
+                        f = 0.0f;
                     }
                     data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
                 }
             }
         }
+    } else {
+        // non-causal attention attends only the tokens within the batch (i.e. the KV cache is not used)
+        const int64_t n_tokens = batch.n_tokens;
+
+        assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
+
+        float * data = (float *) lctx.inp_KQ_mask->data;
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_seq_id seq_id = batch.seq_id[j][0];
+
+                for (int i = 0; i < n_tokens; ++i) {
+                    float f = -INFINITY;
+                    for (int s = 0; s < batch.n_seq_id[i]; ++s) {
+                        if (batch.seq_id[i][s] == seq_id) {
+                            f = 0.0f;
+                            break;
+                        }
+                    }
+
+                    data[h*(n_tokens*n_tokens) + j*n_tokens + i] = f;
+                }
+            }
+        }
     }

     if (hparams.need_kq_pos) {
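In the new non-causal branch, token j may attend token i only when the two share a sequence id, so the mask is block diagonal over sequences. A toy rendition (illustrative only; the real code writes into lctx.inp_KQ_mask): with seq ids {0, 0, 1, 1} it prints a 4x4 mask of two 2x2 zero blocks surrounded by -inf.

    #include <cmath>
    #include <cstdio>

    int main() {
        const int n_tokens = 4;
        const int seq_id[n_tokens] = {0, 0, 1, 1};
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_tokens; ++i) {
                // same rule as above: 0 if token i belongs to token j's sequence
                const float f = (seq_id[i] == seq_id[j]) ? 0.0f : -INFINITY;
                std::printf("%6.0f ", f);
            }
            std::printf("\n");
        }
        return 0;
    }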
@@ -7739,29 +8093,20 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }

-    if (
-        const int64_t n_ctx = cparams.n_ctx;
-
-        assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
-
-        int32_t * data = (int32_t *) lctx.inp_K_shift->data;
-
-        for (int i = 0; i < n_ctx; ++i) {
-            data[i] = lctx.kv_self.cells[i].delta;
-        }
-    }
-
-    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
+    if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
         const int64_t n_tokens = batch.n_tokens;

         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
-        float * data = (float *) lctx.inp_mean->data;

+        float * data = (float *) lctx.inp_mean->data;
         memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));

         std::vector<uint64_t> sum(n_tokens, 0);
         for (int i = 0; i < n_tokens; ++i) {
             const llama_seq_id seq_id = batch.seq_id[i][0];
+
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
+
             sum[seq_id] += 1;
         }

@@ -7779,15 +8124,20 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }

-    if (cparams.
+    if (cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
         const int64_t n_tokens = batch.n_tokens;

         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+
         uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));

         for (int i = 0; i < n_tokens; ++i) {
             const llama_seq_id seq_id = batch.seq_id[i][0];
-            const llama_pos
+            const llama_pos    pos    = batch.pos[i];
+
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
+
             if (pos == 0) {
                 data[seq_id] = i;
             }
@@ -7795,6 +8145,35 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
     }
 }

+static void llama_graph_compute(
+        llama_context & lctx,
+          ggml_cgraph * gf,
+                  int   n_threads) {
+#ifdef GGML_USE_MPI
+    const int64_t n_layer = lctx.model.hparams.n_layer;
+    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
+#endif
+
+#ifdef GGML_USE_METAL
+    if (ggml_backend_is_metal(lctx.backend_metal)) {
+        ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
+    }
+#endif
+
+    if (lctx.backend_cpu != nullptr) {
+        ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
+        ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
+    }
+
+    ggml_backend_sched_graph_compute(lctx.sched, gf);
+
+    // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
+
+#ifdef GGML_USE_MPI
+    ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
+#endif
+}
+
 // decode a batch of tokens by evaluating the transformer
 //
 //   - lctx:      llama context
@@ -7821,9 +8200,9 @@ static int llama_decode_internal(
     const auto n_batch = cparams.n_batch;

     GGML_ASSERT(n_tokens <= n_batch);
+    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

     int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
-    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

     const int64_t t_start_us = ggml_time_us();

@@ -7872,21 +8251,26 @@ static int llama_decode_internal(
         batch.seq_id = seq_id_arr.data();
     }

-    //
-
-
-        kv_self.head = 0;
-    }
+    // non-causal masks do not use the KV cache
+    if (hparams.causal_attn) {
+        llama_kv_cache_update(&lctx);

-
-
-
+        // if we have enough unused cells before the current head ->
+        //   better to start searching from the beginning of the cache, hoping to fill it
+        if (kv_self.head > kv_self.used + 2*n_tokens) {
+            kv_self.head = 0;
+        }

-
-
-
-
-
+        if (!llama_kv_cache_find_slot(kv_self, batch)) {
+            return 1;
+        }
+
+        // a heuristic, to avoid attending the full cache if it is not yet utilized
+        // after enough generations, the benefit from this heuristic disappears
+        // if we start defragmenting the cache, the benefit from this will be more important
+        kv_self.n = std::min(cparams.n_ctx, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
+        //kv_self.n = llama_kv_cache_cell_max(kv_self);
+    }

     //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

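The kv_self.n heuristic above can be restated on plain integers: round the highest occupied cell index up to a multiple of 32 and clamp to [32, n_ctx], e.g. a cell max of 100 attends 128 cells instead of the whole cache. A sketch, assuming GGML_PAD(x, 32) is the usual round-up:

    #include <algorithm>
    #include <cstdint>

    // kv_n_heuristic(100, 4096) == 128; kv_n_heuristic(5, 4096) == 32
    static uint32_t kv_n_heuristic(uint32_t cell_max, uint32_t n_ctx) {
        const uint32_t padded = ((cell_max + 31)/32)*32; // GGML_PAD(cell_max, 32)
        return std::min(n_ctx, std::max<uint32_t>(32u, padded));
    }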
@@ -7896,19 +8280,26 @@ static int llama_decode_internal(
     ggml_cgraph * gf = llama_build_graph(lctx, batch, false);

     // the output is always the last tensor in the graph
-    struct ggml_tensor * res
-    struct ggml_tensor *
-
-
-
-
-
-
-
-
-    res = nullptr;
+    struct ggml_tensor * res  = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
+
+    if (!hparams.causal_attn) {
+        res = nullptr; // do not extract logits for embedding models such as BERT
+
+        // token or sequence embeddings
+        embd = gf->nodes[gf->n_nodes - 1];
+
+        GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
     } else {
-
+        if (strcmp(res->name, "result_output") == 0) {
+            // the token embeddings could be the second to last tensor, or the third to last tensor
+            if (strcmp(embd->name, "result_norm") != 0) {
+                embd = gf->nodes[gf->n_nodes - 3];
+                GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
+            }
+        } else {
+            GGML_ASSERT(false && "missing result_output tensor");
+        }
     }

     // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
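The extraction logic above relies on the graph tail: "result_output" is normally the last node and "result_norm"/"result_embd" sit just before it. A hypothetical helper (not part of the llama.cpp API) that makes the same name-based lookup explicit by scanning backwards from the tail; the ggml_cgraph/ggml_tensor types come from ggml.h:

    #include <cstring>

    static struct ggml_tensor * find_from_tail(struct ggml_cgraph * gf, const char * name) {
        for (int i = gf->n_nodes - 1; i >= 0; --i) {
            if (strcmp(gf->nodes[i]->name, name) == 0) {
                return gf->nodes[i];
            }
        }
        return nullptr; // tensor with this name was never built
    }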
@@ -7924,40 +8315,12 @@ static int llama_decode_internal(
         n_threads = std::min(4, n_threads);
     }

-#ifdef GGML_USE_MPI
-    const int64_t n_layer = hparams.n_layer;
-    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
-#endif
-
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(lctx.backend_metal)) {
-        ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
-    }
-#endif
-
-    if (lctx.backend_cpu != nullptr) {
-        ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
-    }
-
     llama_set_inputs(lctx, batch);

-
-
-    // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
-#endif
+    llama_graph_compute(lctx, gf, n_threads);

     // update the kv ring buffer
     {
-        if (kv_self.has_shift) {
-            kv_self.has_shift = false;
-            for (uint32_t i = 0; i < kv_self.size; ++i) {
-                kv_self.cells[i].delta = 0;
-            }
-        }
-
         kv_self.head += n_tokens;

         // Ensure kv cache head points to a valid index.
@@ -7966,6 +8329,18 @@ static int llama_decode_internal(
         }
     }

+    // decide if we need to defrag the kv cache
+    if (cparams.defrag_thold >= 0.0f) {
+        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens)/float(kv_self.n) : 0.0f;
+
+        // queue defragmentation for next llama_kv_cache_update
+        if (fragmentation > cparams.defrag_thold) {
+            //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
+
+            llama_kv_cache_defrag(kv_self);
+        }
+    }
+
 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
     // requires GGML_PERF to be defined
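The fragmentation measure above on concrete numbers: with n = 512 attended cells, used = 300 and n_tokens = 12, fragmentation = 1 - 312/512 ≈ 0.39, so any defrag_thold below that queues a defrag; caches with fewer than 128 attended cells are never considered fragmented. The formula, extracted as a sketch:

    #include <cstdint>

    static float kv_fragmentation(uint32_t n, uint32_t used, uint32_t n_tokens) {
        // fraction of attended cells that hold no live data
        return n >= 128 ? 1.0f - float(used + n_tokens)/float(n) : 0.0f;
    }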
@@ -7991,66 +8366,341 @@ static int llama_decode_internal(
     logits_out.clear();
 #endif

-    ggml_backend_t
-    GGML_ASSERT(
+    ggml_backend_t backend_res = ggml_backend_sched_get_node_backend(lctx.sched, res);
+    GGML_ASSERT(backend_res != nullptr);
+
     if (batch.logits) {
         logits_out.resize(n_vocab * n_tokens);
         for (uint32_t i = 0; i < n_tokens; i++) {
             if (batch.logits[i] == 0) {
                 continue;
             }
-            ggml_backend_tensor_get_async(
+            ggml_backend_tensor_get_async(backend_res, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
 #ifndef NDEBUG
             logits_valid[i] = true;
 #endif
         }
     } else if (lctx.logits_all) {
         logits_out.resize(n_vocab * n_tokens);
-        ggml_backend_tensor_get_async(
+        ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
 #ifndef NDEBUG
         std::fill(logits_valid.begin(), logits_valid.end(), true);
 #endif
     } else {
         logits_out.resize(n_vocab);
-        ggml_backend_tensor_get_async(
+        ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
 #ifndef NDEBUG
         logits_valid[0] = true;
 #endif
     }
-    ggml_backend_synchronize(
-    }
+    ggml_backend_synchronize(backend_res);
+    }
+
+    // extract embeddings
+    if (cparams.embeddings && embd) {
+        ggml_backend_t backend_embd = ggml_backend_sched_get_node_backend(lctx.sched, embd);
+        GGML_ASSERT(backend_embd != nullptr);
+
+        switch (cparams.pooling_type) {
+            case LLAMA_POOLING_TYPE_NONE:
+                {
+                    // extract token embeddings
+                    auto & embd_out = lctx.embd;
+
+                    if (batch.logits) {
+                        embd_out.resize(n_embd * n_tokens);
+                        for (uint32_t i = 0; i < n_tokens; i++) {
+                            if (batch.logits[i] == 0) {
+                                continue;
+                            }
+
+                            ggml_backend_tensor_get_async(backend_embd, embd, embd_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
+                        }
+                    }
+                } break;
+            case LLAMA_POOLING_TYPE_CLS:
+            case LLAMA_POOLING_TYPE_MEAN:
+                {
+                    GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
+
+                    // extract sequence embeddings
+                    auto & embd_seq_out = lctx.embd_seq;
+                    embd_seq_out.clear();
+
+                    for (uint32_t i = 0; i < n_tokens; i++) {
+                        const llama_seq_id seq_id = batch.seq_id[i][0];
+                        if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+                            continue;
+                        }
+                        embd_seq_out[seq_id].resize(n_embd);
+                        ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
+                    }
+                } break;
+            case LLAMA_POOLING_TYPE_UNSPECIFIED:
+                {
+                    GGML_ASSERT(false && "unknown pooling type");
+                } break;
+        }
+        ggml_backend_synchronize(backend_embd);
+    }
+
+    // measure the performance only for the single-token evals
+    if (n_tokens == 1) {
+        lctx.t_eval_us += ggml_time_us() - t_start_us;
+        lctx.n_eval++;
+    }
+    else if (n_tokens > 1) {
+        lctx.t_p_eval_us += ggml_time_us() - t_start_us;
+        lctx.n_p_eval += n_tokens;
+    }
+
+    // get a more accurate load time, upon first eval
+    // TODO: fix this
+    if (!lctx.has_evaluated_once) {
+        lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+        lctx.has_evaluated_once = true;
+    }
+
+    return 0;
+}
+
+// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
+static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
+    auto & kv_self = lctx.kv_self;
+
+    const auto & hparams = lctx.model.hparams;
+
+    const uint32_t n_layer = hparams.n_layer;
+
+    const uint32_t n_kv   = llama_kv_cache_cell_max(kv_self);
+    const uint32_t n_used = kv_self.used;
+
+    assert(n_used <= n_kv);
+
+    //const int64_t t_start = ggml_time_us();
+
+    // number of cells moved
+    uint32_t n_moves = 0;
+
+    // determine which KV cells to move where
+    //
+    //  cell i moves to ids[i]
+    //
+    //  if ids[i] == i || ids[i] == n_kv, then cell i is not moved
+    //
+    std::vector<uint32_t> ids(n_kv, n_kv);
+
+    for (uint32_t i0 = 0; i0 < n_used; ++i0) {
+        const auto & cell0 = kv_self.cells[i0];
+
+        if (!cell0.is_empty()) {
+            ids[i0] = i0;
+
+            continue;
+        }
+
+        // found a hole - fill it with data from the end of the cache
+
+        uint32_t nh = 1;
+
+        // determine the size of the hole
+        while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
+            nh++;
+        }
+
+        // each move requires 6*n_layer tensors (see build_defrag)
+        //   - source view, destination view, copy operation
+        //   - x2 for keys and values
+        //
+        if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
+            // the graph is too big, we cannot move more cells
+            break;
+        }
+
+        uint32_t nf = 0;
+        uint32_t is = n_kv - 1;
+
+        // starting from the end, find nh non-empty cells
+        for (; is > i0; --is) {
+            const auto & cell1 = kv_self.cells[is];
+
+            if (cell1.is_empty() || ids[is] != n_kv) {
+                continue;
+            }
+
+            // non-empty cell which is not yet moved
+            nf++;
+
+            if (nf == nh) {
+                break;
+            }
+        }
+
+        // this can only happen if `n_used` is not accurate, which would be a bug
+        GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
+
+        nf = 0;
+
+        uint32_t i1 = is;
+
+        // are we moving a continuous block of memory?
+        bool cont = false;
+
+        // go back and move the nf cells to the hole
+        for (; i1 < n_kv; ++i1) {
+            auto & cell1 = kv_self.cells[i1];
+
+            if (cell1.is_empty() || ids[i1] != n_kv) {
+                cont = false;
+                continue;
+            }
+
+            // this cell goes to (i0 + nf)
+            ids[i1] = i0 + nf;
+
+            // move the cell meta data
+            kv_self.cells[i0 + nf] = cell1;
+
+            // clear the old cell and move the head there
+            cell1 = llama_kv_cell();
+            kv_self.head = n_used;
+
+            if (!cont) {
+                n_moves++;
+                cont = true;
+            }
+
+            nf++;
+
+            if (nf == nh) {
+                break;
+            }
+        }
+
+        //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
+
+        i0 += nh - 1;
+    }
+
+    if (n_moves == 0) {
+        return;
+    }
+
+    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
+
+    //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
+
+#if 0
+    // CPU defrag
+    //
+    // TODO: optimizations are possible:
+    //       - multiple threads
+    //       - avoid copying to the host memory when already there
+    //
+    // likely not worth the effort, as we have ggml_graph based defrag
+    //
+
+    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+
+    const uint32_t kv_size = kv_self.size;
+
+    std::vector<uint8_t> buf_k;
+    std::vector<uint8_t> buf_v;
+
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+        const size_t k_size     = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size);
+
+        const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+        const size_t v_size    = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size);
+
+        buf_k.resize(k_size);
+        buf_v.resize(v_size);
+
+        ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
+        ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
+
+        // batch move [i, i+nm) to [id, id+nm)
+        // note: cells can move only to a lower index
+        for (uint32_t i = 0; i < n_kv; ++i) {
+            const uint32_t id = ids[i];
+
+            if (i == id || id == n_kv) {
+                continue;
+            }
+
+            uint32_t nm = 1;
+
+            while (i + nm < n_kv && ids[i + nm] == id + nm) {
+                nm++;
+            }
+
+            // move keys
+            {
+                const int64_t os =  i*k_size_row;
+                const int64_t od = id*k_size_row;
+
+                memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
+            }
+
+            // move values (note: they are transposed)
+            {
+                const int64_t os =  i;
+                const int64_t od = id;
+
+                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                    memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
+                }
+            }
+
+            i += nm - 1;
+        }
+
+        ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
+        ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
+    }
+#else
+    // ggml_graph defrag
+
+    ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
+
+    llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+#endif
+
+    //const int64_t t_end = ggml_time_us();
+
+    //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
+}
+
+static void llama_kv_cache_update_internal(struct llama_context & lctx) {
+    // apply K-shift if needed
+    if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
+        llama_set_k_shift(lctx);
+
+        {
+            ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
+
+            llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+        }

-
-
-        auto & embedding_out = lctx.embedding;
+        {
+            auto & kv_self = lctx.kv_self;

-
-        const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
+            kv_self.has_shift = false;

-
-
-
-
+            for (uint32_t i = 0; i < kv_self.size; ++i) {
+                kv_self.cells[i].delta = 0;
+            }
+        }
     }

-    //
-    if (
-    lctx
-        lctx.n_eval++;
-    }
-    else if (n_tokens > 1) {
-        lctx.t_p_eval_us += ggml_time_us() - t_start_us;
-        lctx.n_p_eval += n_tokens;
-    }
+    // defragment the KV cache if needed
+    if (lctx.kv_self.do_defrag) {
+        llama_kv_cache_defrag_internal(lctx);

-
-    // TODO: fix this
-    if (!lctx.has_evaluated_once) {
-        lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
-        lctx.has_evaluated_once = true;
+        lctx.kv_self.do_defrag = false;
     }
-
-    return 0;
 }

 //
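The defrag pass above first computes a move table ids where cell i moves to ids[i] (ids[i] == i means stay, ids[i] == n_kv means untouched). A simplified, CPU-only sketch of that table construction, ignoring the graph-size limit: for occupancy O.OO.O (n_kv = 6, n_used = 4) it yields ids = {0, 6, 2, 3, 6, 1}, i.e. cell 5 fills the hole at index 1.

    #include <cstdint>
    #include <vector>

    // toy move-table computation; assumes the occupancy vector is consistent
    // with n_used, otherwise the backward scan could run past the front
    static std::vector<uint32_t> defrag_ids(const std::vector<bool> & occ, uint32_t n_used) {
        const uint32_t n_kv = (uint32_t) occ.size();
        std::vector<uint32_t> ids(n_kv, n_kv);
        std::vector<bool> cell(occ);
        uint32_t is = n_kv - 1;
        for (uint32_t i0 = 0; i0 < n_used; ++i0) {
            if (cell[i0]) { ids[i0] = i0; continue; }        // occupied: stays put
            while (!cell[is] || ids[is] != n_kv) { --is; }   // next movable cell from the end
            ids[is]  = i0;                                   // move cell `is` into hole `i0`
            cell[i0] = true;
            cell[is] = false;
        }
        return ids;
    }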
@@ -8085,19 +8735,19 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
     switch (llama_vocab_get_type(vocab)) {
-
-
-
-
-
-
-
-
-
-
-
-
-
+        case LLAMA_VOCAB_TYPE_SPM: {
+            auto buf = token_data.text.substr(3, 2);
+            return strtol(buf.c_str(), NULL, 16);
+        }
+        case LLAMA_VOCAB_TYPE_BPE: {
+            GGML_ASSERT(false);
+            return unicode_to_bytes_bpe(token_data.text);
+        }
+        case LLAMA_VOCAB_TYPE_WPM: {
+            GGML_ASSERT(false);
+        }
+        default:
+            GGML_ASSERT(false);
     }
 }

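The SPM case above works because byte tokens are stored as literal "<0xNN>" strings, so substr(3, 2) picks out the two hex digits. A standalone rendition, assuming well-formed input:

    #include <cstdlib>
    #include <string>

    // "<0x0A>" -> buf == "0A" -> 0x0A (newline)
    static unsigned char spm_byte_token_to_byte(const std::string & text) {
        const std::string buf = text.substr(3, 2);
        return (unsigned char) strtol(buf.c_str(), nullptr, 16);
    }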
@@ -8644,37 +9294,46 @@ struct llm_tokenizer_wpm {
     }

     std::vector<std::string> preprocess(const std::string & text) {
-
-
+        // normalization form D
+        std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
+        std::vector<uint32_t> nfd_codepoints;
+        for (uint32_t code : codepoints) {
+            auto it = nfd_map.equal_range(code);
+            if (it.first != it.second) {
+                for (auto jt = it.first; jt != it.second; jt++) {
+                    nfd_codepoints.push_back(jt->second);
+                }
+            } else {
+                nfd_codepoints.push_back(code);
+            }
+        }

-        //
-        //
-        std::vector<std::string> words;
+        // strip accents, strip control, uniformize whitespace,
+        // to lowercase, pad chinese characters, pad punctuation
         std::string new_str = "";
-
-
-
-
-
-
-
-
+        for (uint32_t code : nfd_codepoints) {
+            int type = codepoint_type(code);
+            if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
+                continue;
+            }
+            code = to_lower(code);
+            if (type == CODEPOINT_TYPE_WHITESPACE) {
+                code = ' ';
             }
-
+            std::string s = codepoint_to_utf8(code);
+            if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
                 new_str += " ";
-                new_str +=
+                new_str += s;
                 new_str += " ";
-
-
-            else {
-                new_str += ori_str[i];
-                i += 1;
+            } else {
+                new_str += s;
             }
         }

         // split by whitespace
         uint64_t l = 0;
         uint64_t r = 0;
+        std::vector<std::string> words;
         while (r < new_str.size()) {
             // if is whitespace
             if (isspace(new_str[r])) {
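The NFD step above expands each codepoint through a precomputed decomposition multimap, e.g. U+00E9 (é) becomes U+0065 (e) + U+0301 (combining acute), after which the accent mark is dropped by the CODEPOINT_TYPE_ACCENT_MARK check. A self-contained sketch of the same lookup (a caller-supplied map stands in for nfd_map):

    #include <cstdint>
    #include <map>
    #include <vector>

    static std::vector<uint32_t> nfd_decompose(
            const std::vector<uint32_t> & cps,
            const std::multimap<uint32_t, uint32_t> & nfd) {
        std::vector<uint32_t> out;
        for (uint32_t cp : cps) {
            auto range = nfd.equal_range(cp);
            if (range.first != range.second) {
                // decomposable: emit all decomposition codepoints in order
                for (auto it = range.first; it != range.second; ++it) {
                    out.push_back(it->second);
                }
            } else {
                out.push_back(cp); // not in the map: pass through unchanged
            }
        }
        return out;
    }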
@@ -8692,47 +9351,21 @@ struct llm_tokenizer_wpm {
         return words;
     }

-
-
-
-
-
-        if (c >= 'A' && c <= 'Z') {
-            text2[i] = c - 'A' + 'a';
-        }
+    uint32_t to_lower(uint32_t code) {
+        static const std::locale locale("en_US.UTF-8");
+#if defined(_WIN32)
+        if (code > 0xFFFF) {
+            return code;
         }
-
+#endif
+        return std::tolower(wchar_t(code), locale);
     }

-    bool
-
-
-
-
-        unsigned char ch = static_cast<unsigned char>(str[i]);
-        if (ch <= 0x7f) {
-            codepoint = ch;
-            num_bytes = 1;
-        } else if ((ch >> 5) == 0x06) {
-            codepoint = ch & 0x1f;
-            num_bytes = 2;
-        } else if ((ch >> 4) == 0x0e) {
-            codepoint = ch & 0x0f;
-            num_bytes = 3;
-        } else if ((ch >> 3) == 0x1e) {
-            codepoint = ch & 0x07;
-            num_bytes = 4;
-        }
-        for (int j = 1; j < num_bytes; ++j) {
-            if (i + j >= len) {
-                return false; // incomplete UTF-8 character
-            }
-            unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
-            if ((next_ch >> 6) != 0x02) {
-                return false; // invalid trailing byte
-            }
-            codepoint = (codepoint << 6) | (next_ch & 0x3f);
-        }
+    bool is_ascii_punct(uint32_t code) {
+        return code < 256 && ispunct(code);
+    }
+
+    bool is_chinese_char(uint32_t codepoint) {
         if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
             (codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
             (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
@@ -8748,41 +9381,6 @@ struct llm_tokenizer_wpm {
         return false;
     }

-    std::string strip_accents(const std::string & input_string) {
-        std::string resultString;
-        std::map<std::string, char> accent_map = {
-            {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
-            {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
-            {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
-            {"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
-            {"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
-            {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
-            {"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
-            {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
-            {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
-        };
-
-        for (size_t i = 0; i < input_string.length();) {
-            int len = utf8_len(input_string[i]);
-            std::string curChar = input_string.substr(i, len);
-            auto iter = accent_map.find(curChar);
-            if (iter != accent_map.end()) {
-                resultString += iter->second;
-            } else {
-                resultString += curChar;
-            }
-            i += len;
-        }
-
-        return resultString;
-    }
-
-    static size_t utf8_len(char src) {
-        const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
-        uint8_t highbits = static_cast<uint8_t>(src) >> 4;
-        return lookup[highbits];
-    }
-
     const llama_vocab & vocab;
 };

@@ -9816,10 +10414,6 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand
     }
 }

-void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
-    llama_sample_temp(ctx, candidates_p, temp);
-}
-
 void llama_sample_repetition_penalties(
             struct llama_context * ctx,
           llama_token_data_array * candidates,
@@ -9946,38 +10540,6 @@ void llama_sample_apply_guidance(
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }

-void llama_sample_classifier_free_guidance(
-          struct llama_context * ctx,
-        llama_token_data_array * candidates,
-          struct llama_context * guidance_ctx,
-                         float   scale) {
-    GGML_ASSERT(ctx);
-    int64_t t_start_sample_us;
-
-    t_start_sample_us = ggml_time_us();
-    const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
-
-    GGML_ASSERT(n_vocab == candidates->size);
-    GGML_ASSERT(!candidates->sorted);
-
-    std::vector<float> logits_base(n_vocab);
-    for (size_t i = 0; i < n_vocab; ++i) {
-        logits_base[i] = candidates->data[i].logit;
-    }
-
-    float * logits_guidance = llama_get_logits(guidance_ctx);
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
-    t_start_sample_us = ggml_time_us();
-
-    for (size_t i = 0; i < n_vocab; ++i) {
-        candidates->data[i].logit = logits_base[i];
-    }
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-}
-
 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
     GGML_ASSERT(ctx);

@@ -10411,7 +10973,7 @@ struct quantize_state_internal {
|
|
10411
10973
|
{}
|
10412
10974
|
};
|
10413
10975
|
|
10414
|
-
static void llama_convert_tensor_internal(
|
10976
|
+
static void llama_tensor_dequantize_internal(
|
10415
10977
|
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
|
10416
10978
|
const size_t nelements, const int nthread
|
10417
10979
|
) {
|
@@ -10508,31 +11070,47 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
10508
11070
|
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
10509
11071
|
new_type = GGML_TYPE_Q8_0;
|
10510
11072
|
}
|
10511
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype ==
|
11073
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
11074
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
10512
11075
|
new_type = GGML_TYPE_Q5_K;
|
10513
11076
|
}
|
10514
11077
|
else if (new_type != GGML_TYPE_Q8_0) {
|
10515
11078
|
new_type = GGML_TYPE_Q6_K;
|
10516
11079
|
}
|
10517
11080
|
} else if (name == "token_embd.weight") {
|
10518
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
|
11081
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
|
11082
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
|
10519
11083
|
new_type = GGML_TYPE_Q2_K;
|
10520
11084
|
}
|
11085
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
11086
|
+
new_type = GGML_TYPE_IQ3_S;
|
11087
|
+
}
|
10521
11088
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
10522
|
-
new_type =
|
11089
|
+
new_type = GGML_TYPE_IQ3_S;
|
10523
11090
|
}
|
10524
|
-
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S
|
11091
|
+
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
|
11092
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
10525
11093
|
if (name.find("attn_v.weight") != std::string::npos) {
|
10526
11094
|
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
|
10527
|
-
else new_type = GGML_TYPE_Q2_K;
|
11095
|
+
else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
|
10528
11096
|
++qs.i_attention_wv;
|
10529
11097
|
}
|
11098
|
+
else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
|
11099
|
+
new_type = GGML_TYPE_Q4_K;
|
11100
|
+
}
|
10530
11101
|
else if (name.find("ffn_down") != std::string::npos) {
|
10531
|
-
if (qs.i_ffn_down < qs.n_ffn_down/8)
|
11102
|
+
if (qs.i_ffn_down < qs.n_ffn_down/8) {
|
11103
|
+
new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
|
11104
|
+
}
|
10532
11105
|
++qs.i_ffn_down;
|
10533
11106
|
}
|
10534
11107
|
else if (name.find("attn_output.weight") != std::string::npos) {
|
10535
|
-
if (
|
11108
|
+
if (qs.model.hparams.n_expert == 8) {
|
11109
|
+
new_type = GGML_TYPE_Q5_K;
|
11110
|
+
} else {
|
11111
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
|
11112
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
|
11113
|
+
}
|
10536
11114
|
}
|
10537
11115
|
} else if (name.find("attn_v.weight") != std::string::npos) {
|
10538
11116
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
|
@@ -10542,13 +11120,25 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
10542
11120
|
new_type = GGML_TYPE_Q4_K;
|
10543
11121
|
}
|
10544
11122
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
10545
|
-
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ?
|
11123
|
+
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
|
11124
|
+
}
|
11125
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
|
11126
|
+
new_type = GGML_TYPE_Q4_K;
|
11127
|
+
}
|
11128
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
11129
|
+
new_type = GGML_TYPE_Q4_K;
|
10546
11136
|
}
|
10547
11137
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
10548
11138
|
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
10549
11139
|
}
|
10550
11140
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
10551
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) {
|
11141
|
+
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
|
10552
11142
|
new_type = GGML_TYPE_Q5_K;
|
10553
11143
|
}
|
10554
11144
|
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
@@ -10574,14 +11164,24 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
10574
11164
|
// TODO: explore better strategies
|
10575
11165
|
new_type = GGML_TYPE_Q8_0;
|
10576
11166
|
}
|
10577
|
-
else if (ftype ==
|
10578
|
-
new_type =
|
11167
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
|
11168
|
+
new_type = GGML_TYPE_IQ3_XXS;
|
11169
|
+
}
|
11170
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
11171
|
+
new_type = GGML_TYPE_IQ2_S;
|
11172
|
+
}
|
11173
|
+
} else if (name.find("attn_q.weight") != std::string::npos) {
|
11174
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
|
11175
|
+
new_type = GGML_TYPE_IQ3_XXS;
|
11176
|
+
}
|
11177
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
11178
|
+
new_type = GGML_TYPE_IQ2_S;
|
10579
11179
|
}
|
10580
11180
|
} else if (name.find("ffn_down") != std::string::npos) {
|
10581
11181
|
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
|
10582
11182
|
int i_layer = info.first, n_layer = info.second;
|
10583
11183
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
10584
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S
|
11184
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
|
10585
11185
|
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
|
10586
11186
|
}
|
10587
11187
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
|
@@ -10592,6 +11192,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
10592
11192
|
: arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
|
10593
11193
|
: GGML_TYPE_Q3_K;
|
10594
11194
|
}
|
11195
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
|
11196
|
+
(qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
|
11197
|
+
new_type = GGML_TYPE_Q4_K;
|
11198
|
+
}
|
10595
11199
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
|
10596
11200
|
new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
|
10597
11201
|
}
|
@@ -10603,8 +11207,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
10603
11207
|
if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
|
10604
11208
|
}
|
10605
11209
|
}
|
10606
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) {
|
10607
|
-
|
11210
|
+
else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
|
11211
|
+
new_type = GGML_TYPE_Q5_K;
|
10608
11212
|
}
|
10609
11213
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
|
10610
11214
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
|
@@ -10621,39 +11225,43 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
10621
11225
|
} else if (name.find("attn_output.weight") != std::string::npos) {
|
10622
11226
|
if (arch != LLM_ARCH_FALCON) {
|
10623
11227
|
if (qs.model.hparams.n_expert == 8) {
|
10624
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype ==
|
11228
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
10625
11229
|
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
|
10626
|
-
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M
|
11230
|
+
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
|
11231
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
|
10627
11232
|
new_type = GGML_TYPE_Q5_K;
|
10628
11233
|
}
|
10629
11234
|
} else {
|
10630
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K
|
10631
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type =
|
10632
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
10633
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
11235
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
|
11236
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
|
11237
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
|
11238
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
|
11239
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K;
|
10634
11240
|
}
|
10635
11241
|
} else {
|
10636
11242
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
|
10637
11243
|
}
|
10638
11244
|
}
|
10639
11245
|
else if (name.find("attn_qkv.weight") != std::string::npos) {
|
10640
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L
|
11246
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
11247
|
+
new_type = GGML_TYPE_Q4_K;
|
11248
|
+
}
|
10641
11249
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
|
10642
11250
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
|
10643
11251
|
}
|
10644
11252
|
else if (name.find("ffn_gate") != std::string::npos) {
|
10645
11253
|
auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
|
10646
11254
|
int i_layer = info.first, n_layer = info.second;
|
10647
|
-
if (ftype ==
|
10648
|
-
new_type =
|
11255
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
|
11256
|
+
new_type = GGML_TYPE_IQ3_XXS;
|
10649
11257
|
}
|
10650
11258
|
++qs.i_ffn_gate;
|
10651
11259
|
}
|
10652
11260
|
else if (name.find("ffn_up") != std::string::npos) {
|
10653
11261
|
auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
|
10654
11262
|
int i_layer = info.first, n_layer = info.second;
|
10655
|
-
if (ftype ==
|
10656
|
-
new_type =
|
11263
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
|
11264
|
+
new_type = GGML_TYPE_IQ3_XXS;
|
10657
11265
|
}
|
10658
11266
|
++qs.i_ffn_up;
|
10659
11267
|
}
|
@@ -10671,9 +11279,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
10671
11279
|
//}
|
10672
11280
|
bool convert_incompatible_tensor = false;
|
10673
11281
|
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
10674
|
-
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
|
10675
|
-
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
|
10676
|
-
new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
|
11282
|
+
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
|
11283
|
+
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
|
11284
|
+
new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
|
10677
11285
|
int nx = tensor->ne[0];
|
10678
11286
|
int ny = tensor->ne[1];
|
10679
11287
|
if (nx % QK_K != 0) {
|
@@ -10687,13 +11295,16 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
10687
11295
|
switch (new_type) {
|
10688
11296
|
case GGML_TYPE_IQ2_XXS:
|
10689
11297
|
case GGML_TYPE_IQ2_XS:
|
11298
|
+
case GGML_TYPE_IQ2_S:
|
10690
11299
|
case GGML_TYPE_IQ3_XXS:
|
11300
|
+
case GGML_TYPE_IQ3_S:
|
10691
11301
|
case GGML_TYPE_IQ1_S:
|
10692
11302
|
case GGML_TYPE_Q2_K:
|
10693
|
-
case GGML_TYPE_Q3_K:
|
10694
|
-
case
|
10695
|
-
case
|
10696
|
-
case
|
11303
|
+
case GGML_TYPE_Q3_K:
|
11304
|
+
case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
|
11305
|
+
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
|
11306
|
+
case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
|
11307
|
+
case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
|
10697
11308
|
default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
|
10698
11309
|
}
|
10699
11310
|
LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
|
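The switch above encodes a fallback rule: k- and i-quants require the row size nx to be a multiple of QK_K (256), and incompatible tensors are downgraded to a legacy type of similar bit width. The same mapping, restated as a stand-alone helper (a sketch, not part of the diff):

#include "ggml.h"

// Mirrors the fallback switch above: applied when nx % QK_K != 0.
static ggml_type quant_fallback(ggml_type t) {
    switch (t) {
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ2_S:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_IQ4_XS: return GGML_TYPE_IQ4_NL;
        case GGML_TYPE_Q4_K:   return GGML_TYPE_Q5_0;
        case GGML_TYPE_Q5_K:   return GGML_TYPE_Q5_1;
        case GGML_TYPE_Q6_K:   return GGML_TYPE_Q8_0;
        default:               return t; // already compatible
    }
}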
@@ -10703,6 +11314,46 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
10703
11314
|
return new_type;
|
10704
11315
|
}
|
10705
11316
|
|
11317
|
+
static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, int64_t * hist_cur, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
11318
|
+
std::mutex mutex;
|
11319
|
+
int counter = 0;
|
11320
|
+
size_t new_size = 0;
|
11321
|
+
if (nthread < 2) {
|
11322
|
+
// single-thread
|
11323
|
+
return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur, imatrix);
|
11324
|
+
}
|
11325
|
+
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
|
11326
|
+
nrows, n_per_row, imatrix]() {
|
11327
|
+
std::array<int64_t, 1 << 4> local_hist = {};
|
11328
|
+
const int nrows_per_chunk = chunk_size / n_per_row;
|
11329
|
+
size_t local_size = 0;
|
11330
|
+
while (true) {
|
11331
|
+
std::unique_lock<std::mutex> lock(mutex);
|
11332
|
+
int first_row = counter; counter += nrows_per_chunk;
|
11333
|
+
if (first_row >= nrows) {
|
11334
|
+
if (local_size > 0) {
|
11335
|
+
for (int j=0; j<int(local_hist.size()); ++j) {
|
11336
|
+
hist_cur[j] += local_hist[j];
|
11337
|
+
}
|
11338
|
+
new_size += local_size;
|
11339
|
+
}
|
11340
|
+
break;
|
11341
|
+
}
|
11342
|
+
lock.unlock();
|
11343
|
+
const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
|
11344
|
+
local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
|
11345
|
+
first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
|
11346
|
+
}
|
11347
|
+
};
|
11348
|
+
for (int it = 0; it < nthread - 1; ++it) {
|
11349
|
+
workers.emplace_back(compute);
|
11350
|
+
}
|
11351
|
+
compute();
|
11352
|
+
for (auto & w : workers) { w.join(); }
|
11353
|
+
workers.clear();
|
11354
|
+
return new_size;
|
11355
|
+
}
|
11356
|
+
|
10706
11357
|
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
|
10707
11358
|
ggml_type quantized_type;
|
10708
11359
|
llama_ftype ftype = params->ftype;
|
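The new llama_tensor_quantize_internal() above factors out a small self-scheduling worker pattern: a mutex-guarded counter hands out chunks of rows, the calling thread works alongside nthread - 1 spawned workers, and each thread folds its partial byte count back in under the lock. A generic sketch of the same dispatch loop, assuming nthread >= 1 and with process_chunk as a hypothetical stand-in for ggml_quantize_chunk:

#include <algorithm>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>

static size_t parallel_chunks(int nitems, int chunk_size, int nthread,
                              const std::function<size_t(int, int)> & process_chunk) {
    std::mutex mutex;
    int counter = 0;
    size_t total = 0;
    auto compute = [&]() {
        size_t local = 0;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            const int first = counter; counter += chunk_size;
            if (first >= nitems) {
                total += local; // fold the partial result back in under the lock
                break;
            }
            lock.unlock();
            local += process_chunk(first, std::min(nitems - first, chunk_size));
        }
    };
    std::vector<std::thread> workers;
    for (int it = 0; it < nthread - 1; ++it) {
        workers.emplace_back(compute);
    }
    compute(); // the calling thread participates too
    for (auto & w : workers) { w.join(); }
    return total;
}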
@@ -10719,7 +11370,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
10719
11370
|
// K-quants
|
10720
11371
|
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
|
10721
11372
|
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
|
10722
|
-
case
|
11373
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_XS: quantized_type = GGML_TYPE_IQ3_S; break;
|
10723
11374
|
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
10724
11375
|
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
|
10725
11376
|
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
|
@@ -10730,9 +11381,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
10730
11381
|
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
10731
11382
|
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
|
10732
11383
|
case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
|
11384
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_S: quantized_type = GGML_TYPE_IQ2_XS; break;
|
11385
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_M: quantized_type = GGML_TYPE_IQ2_S; break;
|
10733
11386
|
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
|
10734
11387
|
case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
|
10735
11388
|
case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;
|
11389
|
+
case LLAMA_FTYPE_MOSTLY_IQ4_XS: quantized_type = GGML_TYPE_IQ4_XS; break;
|
11390
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_S: quantized_type = GGML_TYPE_IQ3_S; break;
|
11391
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_M: quantized_type = GGML_TYPE_IQ3_S; break;
|
10736
11392
|
|
10737
11393
|
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
10738
11394
|
}
|
@@ -10810,7 +11466,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
10810
11466
|
|
10811
11467
|
std::vector<std::thread> workers;
|
10812
11468
|
workers.reserve(nthread);
|
10813
|
-
std::mutex mutex;
|
10814
11469
|
|
10815
11470
|
int idx = 0;
|
10816
11471
|
|
@@ -10862,7 +11517,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
10862
11517
|
quantize &= !params->only_copy;
|
10863
11518
|
|
10864
11519
|
// do not quantize expert gating tensors
|
10865
|
-
|
11520
|
+
// NOTE: can't use LLM_TN here because the layer number is not known
|
11521
|
+
quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
|
10866
11522
|
|
10867
11523
|
// do not quantize positional embeddings and token types (BERT)
|
10868
11524
|
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
|
@@ -10906,6 +11562,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
10906
11562
|
}
|
10907
11563
|
if ((new_type == GGML_TYPE_IQ2_XXS ||
|
10908
11564
|
new_type == GGML_TYPE_IQ2_XS ||
|
11565
|
+
new_type == GGML_TYPE_IQ2_S ||
|
10909
11566
|
new_type == GGML_TYPE_IQ1_S ||
|
10910
11567
|
(new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
|
10911
11568
|
LLAMA_LOG_ERROR("\n\n============================================================\n");
|
@@ -10922,7 +11579,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
10922
11579
|
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
|
10923
11580
|
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
|
10924
11581
|
} else {
|
10925
|
-
|
11582
|
+
llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
|
10926
11583
|
f32_data = (float *) f32_conv_buf.data();
|
10927
11584
|
}
|
10928
11585
|
|
@@ -10943,41 +11600,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
10943
11600
|
|
10944
11601
|
const int nchunk = (nelements + chunk_size - 1)/chunk_size;
|
10945
11602
|
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
|
10946
|
-
|
10947
|
-
new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
|
10948
|
-
} else {
|
10949
|
-
int counter = 0;
|
10950
|
-
new_size = 0;
|
10951
|
-
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
|
10952
|
-
nrows, n_per_row, imatrix]() {
|
10953
|
-
std::array<int64_t, 1 << 4> local_hist = {};
|
10954
|
-
const int nrows_per_chunk = chunk_size / n_per_row;
|
10955
|
-
size_t local_size = 0;
|
10956
|
-
while (true) {
|
10957
|
-
std::unique_lock<std::mutex> lock(mutex);
|
10958
|
-
int first_row = counter; counter += nrows_per_chunk;
|
10959
|
-
if (first_row >= nrows) {
|
10960
|
-
if (local_size > 0) {
|
10961
|
-
for (int j=0; j<int(local_hist.size()); ++j) {
|
10962
|
-
hist_cur[j] += local_hist[j];
|
10963
|
-
}
|
10964
|
-
new_size += local_size;
|
10965
|
-
}
|
10966
|
-
break;
|
10967
|
-
}
|
10968
|
-
lock.unlock();
|
10969
|
-
const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
|
10970
|
-
local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
|
10971
|
-
first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
|
10972
|
-
}
|
10973
|
-
};
|
10974
|
-
for (int it = 0; it < nthread_use - 1; ++it) {
|
10975
|
-
workers.emplace_back(compute);
|
10976
|
-
}
|
10977
|
-
compute();
|
10978
|
-
for (auto & w : workers) { w.join(); }
|
10979
|
-
workers.clear();
|
10980
|
-
}
|
11603
|
+
new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, hist_cur.data(), imatrix, workers, nthread_use);
|
10981
11604
|
|
10982
11605
|
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
10983
11606
|
int64_t tot_count = 0;
|
@@ -11327,7 +11950,7 @@ static int llama_apply_lora_from_file_internal(
|
|
11327
11950
|
struct llama_model_params llama_model_default_params() {
|
11328
11951
|
struct llama_model_params result = {
|
11329
11952
|
/*.n_gpu_layers =*/ 0,
|
11330
|
-
/*.split_mode =*/ LLAMA_SPLIT_LAYER,
|
11953
|
+
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
11331
11954
|
/*.main_gpu =*/ 0,
|
11332
11955
|
/*.tensor_split =*/ nullptr,
|
11333
11956
|
/*.progress_callback =*/ nullptr,
|
@@ -11353,7 +11976,8 @@ struct llama_context_params llama_context_default_params() {
|
|
11353
11976
|
/*.n_batch =*/ 512,
|
11354
11977
|
/*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
|
11355
11978
|
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
|
11356
|
-
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
|
11979
|
+
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
|
11980
|
+
/*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
|
11357
11981
|
/*.rope_freq_base =*/ 0.0f,
|
11358
11982
|
/*.rope_freq_scale =*/ 0.0f,
|
11359
11983
|
/*.yarn_ext_factor =*/ -1.0f,
|
@@ -11361,15 +11985,16 @@ struct llama_context_params llama_context_default_params() {
|
|
11361
11985
|
/*.yarn_beta_fast =*/ 32.0f,
|
11362
11986
|
/*.yarn_beta_slow =*/ 1.0f,
|
11363
11987
|
/*.yarn_orig_ctx =*/ 0,
|
11988
|
+
/*.defrag_thold =*/ -1.0f,
|
11364
11989
|
/*.cb_eval =*/ nullptr,
|
11365
11990
|
/*.cb_eval_user_data =*/ nullptr,
|
11366
11991
|
/*.type_k =*/ GGML_TYPE_F16,
|
11367
11992
|
/*.type_v =*/ GGML_TYPE_F16,
|
11368
|
-
/*.mul_mat_q =*/ true,
|
11369
11993
|
/*.logits_all =*/ false,
|
11370
|
-
/*.embedding =*/ false,
|
11994
|
+
/*.embeddings =*/ false,
|
11371
11995
|
/*.offload_kqv =*/ true,
|
11372
|
-
/*.do_pooling =*/ true,
|
11996
|
+
/*.abort_callback =*/ nullptr,
|
11997
|
+
/*.abort_callback_data =*/ nullptr,
|
11373
11998
|
};
|
11374
11999
|
|
11375
12000
|
return result;
|
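For callers, the visible change in llama_context_params is that mul_mat_q is gone, the embedding flag became embeddings, and defrag_thold, pooling_type and the abort callback pair are new. A usage sketch with hypothetical override values:

#include "llama.h"

llama_context_params make_params() {
    llama_context_params cparams = llama_context_default_params();
    cparams.defrag_thold = 0.1f;  // hypothetical: defrag the KV cache above ~10% fragmentation (-1.0f = off)
    cparams.embeddings   = true;  // renamed from `embedding` in 0.12.x
    cparams.pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // defer to the model's metadata
    return cparams;
}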
@@ -11421,15 +12046,6 @@ bool llama_supports_gpu_offload(void) {
|
|
11421
12046
|
#endif
|
11422
12047
|
}
|
11423
12048
|
|
11424
|
-
// deprecated:
|
11425
|
-
bool llama_mmap_supported(void) {
|
11426
|
-
return llama_supports_mmap();
|
11427
|
-
}
|
11428
|
-
|
11429
|
-
bool llama_mlock_supported(void) {
|
11430
|
-
return llama_supports_mlock();
|
11431
|
-
}
|
11432
|
-
|
11433
12049
|
void llama_backend_init(void) {
|
11434
12050
|
ggml_time_init();
|
11435
12051
|
|
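The deprecated probes are dropped without a shim, so callers switch directly to the llama_supports_* family. A minimal sketch:

#include "llama.h"

void report_capabilities() {
    const bool can_mmap  = llama_supports_mmap();   // was: llama_mmap_supported()
    const bool can_mlock = llama_supports_mlock();  // was: llama_mlock_supported()
    (void) can_mmap; (void) can_mlock;
}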
@@ -11525,9 +12141,10 @@ struct llama_context * llama_new_context_with_model(
|
|
11525
12141
|
cparams.yarn_attn_factor = params.yarn_attn_factor;
|
11526
12142
|
cparams.yarn_beta_fast = params.yarn_beta_fast;
|
11527
12143
|
cparams.yarn_beta_slow = params.yarn_beta_slow;
|
11528
|
-
cparams.mul_mat_q = params.mul_mat_q;
|
12144
|
+
cparams.defrag_thold = params.defrag_thold;
|
12145
|
+
cparams.embeddings = params.embeddings;
|
11529
12146
|
cparams.offload_kqv = params.offload_kqv;
|
11530
|
-
cparams.do_pooling = params.do_pooling;
|
12147
|
+
cparams.pooling_type = params.pooling_type;
|
11531
12148
|
|
11532
12149
|
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
11533
12150
|
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
|
@@ -11541,16 +12158,24 @@ struct llama_context * llama_new_context_with_model(
|
|
11541
12158
|
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
11542
12159
|
|
11543
12160
|
auto rope_scaling_type = params.rope_scaling_type;
|
11544
|
-
if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
|
12161
|
+
if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
|
11545
12162
|
rope_scaling_type = hparams.rope_scaling_type_train;
|
11546
12163
|
}
|
11547
12164
|
|
11548
|
-
if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) {
|
12165
|
+
if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
|
11549
12166
|
cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
|
11550
12167
|
}
|
11551
12168
|
|
11552
12169
|
if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
|
11553
|
-
cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
|
12170
|
+
cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
|
12171
|
+
}
|
12172
|
+
|
12173
|
+
if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
|
12174
|
+
if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
|
12175
|
+
cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
|
12176
|
+
} else {
|
12177
|
+
cparams.pooling_type = hparams.pooling_type;
|
12178
|
+
}
|
11554
12179
|
}
|
11555
12180
|
|
11556
12181
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
@@ -11561,8 +12186,11 @@ struct llama_context * llama_new_context_with_model(
|
|
11561
12186
|
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
11562
12187
|
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
|
11563
12188
|
|
11564
|
-
ctx->rng = std::mt19937(params.seed);
|
11565
|
-
ctx->logits_all = params.logits_all;
|
12189
|
+
ctx->abort_callback = params.abort_callback;
|
12190
|
+
ctx->abort_callback_data = params.abort_callback_data;
|
12191
|
+
|
12192
|
+
ctx->rng = std::mt19937(params.seed);
|
12193
|
+
ctx->logits_all = params.logits_all;
|
11566
12194
|
|
11567
12195
|
const ggml_type type_k = params.type_k;
|
11568
12196
|
const ggml_type type_v = params.type_v;
|
@@ -11584,8 +12212,8 @@ struct llama_context * llama_new_context_with_model(
|
|
11584
12212
|
}
|
11585
12213
|
#elif defined(GGML_USE_CUBLAS)
|
11586
12214
|
if (model->n_gpu_layers > 0) {
|
11587
|
-
// with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
|
11588
|
-
if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) {
|
12215
|
+
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
12216
|
+
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
11589
12217
|
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
|
11590
12218
|
if (backend == nullptr) {
|
11591
12219
|
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
|
@@ -11594,7 +12222,7 @@ struct llama_context * llama_new_context_with_model(
|
|
11594
12222
|
}
|
11595
12223
|
ctx->backends.push_back(backend);
|
11596
12224
|
} else {
|
11597
|
-
//
|
12225
|
+
// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
|
11598
12226
|
for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
|
11599
12227
|
ggml_backend_t backend = ggml_backend_cuda_init(device);
|
11600
12228
|
if (backend == nullptr) {
|
@@ -11620,13 +12248,31 @@ struct llama_context * llama_new_context_with_model(
|
|
11620
12248
|
}
|
11621
12249
|
#elif defined(GGML_USE_SYCL)
|
11622
12250
|
if (model->n_gpu_layers > 0) {
|
11623
|
-
|
11624
|
-
if (
|
11625
|
-
|
11626
|
-
|
11627
|
-
|
12251
|
+
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
12252
|
+
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
12253
|
+
int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
|
12254
|
+
ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
|
12255
|
+
if (backend == nullptr) {
|
12256
|
+
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
|
12257
|
+
llama_free(ctx);
|
12258
|
+
return nullptr;
|
12259
|
+
}
|
12260
|
+
ctx->backends.push_back(backend);
|
12261
|
+
} else {
|
12262
|
+
// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
|
12263
|
+
int id_list[GGML_SYCL_MAX_DEVICES];
|
12264
|
+
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
|
12265
|
+
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
|
12266
|
+
int device_id = id_list[i];
|
12267
|
+
ggml_backend_t backend = ggml_backend_sycl_init(i);
|
12268
|
+
if (backend == nullptr) {
|
12269
|
+
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
|
12270
|
+
llama_free(ctx);
|
12271
|
+
return nullptr;
|
12272
|
+
}
|
12273
|
+
ctx->backends.push_back(backend);
|
12274
|
+
}
|
11628
12275
|
}
|
11629
|
-
ctx->backends.push_back(backend);
|
11630
12276
|
}
|
11631
12277
|
#elif defined(GGML_USE_KOMPUTE)
|
11632
12278
|
if (model->n_gpu_layers > 0) {
|
@@ -11647,8 +12293,7 @@ struct llama_context * llama_new_context_with_model(
|
|
11647
12293
|
}
|
11648
12294
|
ctx->backends.push_back(ctx->backend_cpu);
|
11649
12295
|
|
11650
|
-
if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v,
|
11651
|
-
cparams.n_ctx, cparams.offload_kqv)) {
|
12296
|
+
if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) {
|
11652
12297
|
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
11653
12298
|
llama_free(ctx);
|
11654
12299
|
return nullptr;
|
@@ -11675,8 +12320,8 @@ struct llama_context * llama_new_context_with_model(
|
|
11675
12320
|
// resized during inference, reserve maximum
|
11676
12321
|
ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
|
11677
12322
|
|
11678
|
-
if (params.
|
11679
|
-
ctx->
|
12323
|
+
if (params.embeddings) {
|
12324
|
+
ctx->embd.reserve(hparams.n_embd*cparams.n_batch);
|
11680
12325
|
}
|
11681
12326
|
|
11682
12327
|
// graph inputs
|
@@ -11707,7 +12352,6 @@ struct llama_context * llama_new_context_with_model(
|
|
11707
12352
|
ggml_set_name(ctx->inp_cls, "inp_cls");
|
11708
12353
|
|
11709
12354
|
ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
|
11710
|
-
|
11711
12355
|
LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
|
11712
12356
|
ggml_backend_buffer_name(ctx->buf_input),
|
11713
12357
|
ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
|
@@ -11727,7 +12371,7 @@ struct llama_context * llama_new_context_with_model(
|
|
11727
12371
|
}
|
11728
12372
|
|
11729
12373
|
// buffer used to store the computation graph and the tensor meta data
|
11730
|
-
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES +
|
12374
|
+
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
|
11731
12375
|
|
11732
12376
|
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
|
11733
12377
|
|
@@ -11796,6 +12440,50 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
|
|
11796
12440
|
return model->vocab.type;
|
11797
12441
|
}
|
11798
12442
|
|
12443
|
+
enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
12444
|
+
switch (model->arch) {
|
12445
|
+
// these models do not use RoPE
|
12446
|
+
case LLM_ARCH_GPT2:
|
12447
|
+
case LLM_ARCH_GPTJ:
|
12448
|
+
case LLM_ARCH_GPTNEOX:
|
12449
|
+
case LLM_ARCH_MPT:
|
12450
|
+
case LLM_ARCH_REFACT:
|
12451
|
+
case LLM_ARCH_BLOOM:
|
12452
|
+
return LLAMA_ROPE_TYPE_NONE;
|
12453
|
+
|
12454
|
+
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
12455
|
+
case LLM_ARCH_LLAMA:
|
12456
|
+
case LLM_ARCH_BAICHUAN:
|
12457
|
+
case LLM_ARCH_STARCODER:
|
12458
|
+
case LLM_ARCH_PLAMO:
|
12459
|
+
case LLM_ARCH_CODESHELL:
|
12460
|
+
case LLM_ARCH_ORION:
|
12461
|
+
case LLM_ARCH_INTERNLM2:
|
12462
|
+
case LLM_ARCH_MINICPM:
|
12463
|
+
return LLAMA_ROPE_TYPE_NORM;
|
12464
|
+
|
12465
|
+
// the pairs of head values are offset by n_rot/2
|
12466
|
+
case LLM_ARCH_FALCON:
|
12467
|
+
case LLM_ARCH_PERSIMMON:
|
12468
|
+
case LLM_ARCH_BERT:
|
12469
|
+
case LLM_ARCH_NOMIC_BERT:
|
12470
|
+
case LLM_ARCH_STABLELM:
|
12471
|
+
case LLM_ARCH_QWEN:
|
12472
|
+
case LLM_ARCH_QWEN2:
|
12473
|
+
case LLM_ARCH_PHI2:
|
12474
|
+
case LLM_ARCH_GEMMA:
|
12475
|
+
case LLM_ARCH_STARCODER2:
|
12476
|
+
return LLAMA_ROPE_TYPE_NEOX;
|
12477
|
+
|
12478
|
+
// all model arches should be listed explicitly here
|
12479
|
+
case LLM_ARCH_UNKNOWN:
|
12480
|
+
GGML_ASSERT(false && "unknown architecture");
|
12481
|
+
break;
|
12482
|
+
}
|
12483
|
+
|
12484
|
+
return LLAMA_ROPE_TYPE_NONE;
|
12485
|
+
}
|
12486
|
+
|
11799
12487
|
int32_t llama_n_vocab(const struct llama_model * model) {
|
11800
12488
|
return model->vocab.id_to_token.size();
|
11801
12489
|
}
|
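llama_rope_type() above is a new public query over the architecture table. A sketch of how a caller might branch on it, assuming model points at a loaded llama_model:

#include "llama.h"

void describe_rope(const llama_model * model) {
    switch (llama_rope_type(model)) {
        case LLAMA_ROPE_TYPE_NONE: /* no rotary embedding (gpt2, mpt, bloom, ...) */      break;
        case LLAMA_ROPE_TYPE_NORM: /* rotates consecutive pairs (llama, minicpm, ...) */  break;
        case LLAMA_ROPE_TYPE_NEOX: /* pairs offset by n_rot/2 (falcon, starcoder2, ...) */ break;
        default: break;
    }
}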
@@ -11898,15 +12586,6 @@ uint32_t llama_model_quantize(
|
|
11898
12586
|
}
|
11899
12587
|
}
|
11900
12588
|
|
11901
|
-
int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
|
11902
|
-
try {
|
11903
|
-
return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
|
11904
|
-
} catch (const std::exception & err) {
|
11905
|
-
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
11906
|
-
return 1;
|
11907
|
-
}
|
11908
|
-
}
|
11909
|
-
|
11910
12589
|
int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
|
11911
12590
|
try {
|
11912
12591
|
return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
|
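Only the model-level LoRA entry point survives the removal above. A migration sketch with a hypothetical adapter path:

#include "llama.h"

int32_t apply_adapter(const llama_model * model) {
    // "adapter.bin" is a hypothetical path; a null base model means "apply onto the model as-is"
    return llama_model_apply_lora_from_file(model, "adapter.bin", /*scale=*/1.0f,
                                            /*path_base_model=*/nullptr, /*n_threads=*/4);
}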
@@ -12038,12 +12717,12 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
|
|
12038
12717
|
llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
|
12039
12718
|
}
|
12040
12719
|
|
12041
|
-
void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
|
12720
|
+
void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
|
12042
12721
|
if (delta == 0) {
|
12043
12722
|
return;
|
12044
12723
|
}
|
12045
12724
|
|
12046
|
-
llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
|
12725
|
+
llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta);
|
12047
12726
|
}
|
12048
12727
|
|
12049
12728
|
void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
|
@@ -12054,6 +12733,19 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla
|
|
12054
12733
|
llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
|
12055
12734
|
}
|
12056
12735
|
|
12736
|
+
llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) {
|
12737
|
+
return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id);
|
12738
|
+
}
|
12739
|
+
|
12740
|
+
void llama_kv_cache_defrag(struct llama_context * ctx) {
|
12741
|
+
llama_kv_cache_defrag(ctx->kv_self);
|
12742
|
+
}
|
12743
|
+
|
12744
|
+
void llama_kv_cache_update(struct llama_context * ctx) {
|
12745
|
+
llama_kv_cache_update_internal(*ctx);
|
12746
|
+
}
|
12747
|
+
|
12748
|
+
|
12057
12749
|
// Returns the *maximum* size of the state
|
12058
12750
|
size_t llama_get_state_size(const struct llama_context * ctx) {
|
12059
12751
|
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
|
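Three KV-cache maintenance calls are added here. A sketch of how they might combine after heavy sequence editing (ctx assumed valid; llama_kv_cache_defrag() only schedules work, which llama_kv_cache_update() or the next decode applies):

#include "llama.h"

void compact_kv(llama_context * ctx) {
    const llama_pos max_pos = llama_kv_cache_seq_pos_max(ctx, /*seq_id=*/0);
    (void) max_pos;             // e.g. decide whether compaction is worthwhile
    llama_kv_cache_defrag(ctx); // schedule a defragmentation pass
    llama_kv_cache_update(ctx); // apply pending K-shifts / defragmentation now
}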
@@ -12064,10 +12756,15 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
|
|
12064
12756
|
// assume worst case for logits although only currently set ones are serialized
|
12065
12757
|
const size_t s_logits = ctx->logits.capacity() * sizeof(float);
|
12066
12758
|
const size_t s_embedding_size = sizeof(size_t);
|
12067
|
-
const size_t s_embedding = ctx->embedding.capacity() * sizeof(float);
|
12068
|
-
const size_t
|
12069
|
-
const size_t
|
12759
|
+
const size_t s_embedding = ctx->embd.capacity() * sizeof(float);
|
12760
|
+
const size_t s_kv_buf_size = sizeof(size_t);
|
12761
|
+
const size_t s_kv_head = sizeof(uint32_t);
|
12762
|
+
const size_t s_kv_size = sizeof(uint32_t);
|
12763
|
+
const size_t s_kv_used = sizeof(uint32_t);
|
12070
12764
|
const size_t s_kv = ctx->kv_self.total_size();
|
12765
|
+
// TODO: assume the max is more than 1 seq_id per KV cell
|
12766
|
+
const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
|
12767
|
+
const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
|
12071
12768
|
|
12072
12769
|
const size_t s_total = (
|
12073
12770
|
+ s_rng_size
|
@@ -12076,9 +12773,12 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
|
|
12076
12773
|
+ s_logits
|
12077
12774
|
+ s_embedding_size
|
12078
12775
|
+ s_embedding
|
12776
|
+
+ s_kv_buf_size
|
12777
|
+
+ s_kv_head
|
12079
12778
|
+ s_kv_size
|
12080
|
-
+
|
12779
|
+
+ s_kv_used
|
12081
12780
|
+ s_kv
|
12781
|
+
+ s_kv_cells
|
12082
12782
|
);
|
12083
12783
|
|
12084
12784
|
return s_total;
|
@@ -12165,12 +12865,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
12165
12865
|
|
12166
12866
|
// copy embeddings
|
12167
12867
|
{
|
12168
|
-
const size_t embedding_size = ctx->embedding.size();
|
12868
|
+
const size_t embeddings_size = ctx->embd.size();
|
12169
12869
|
|
12170
|
-
data_ctx->write(&
|
12870
|
+
data_ctx->write(&embeddings_size, sizeof(embeddings_size));
|
12171
12871
|
|
12172
|
-
if (
|
12173
|
-
data_ctx->write(ctx->
|
12872
|
+
if (embeddings_size) {
|
12873
|
+
data_ctx->write(ctx->embd.data(), embeddings_size * sizeof(float));
|
12174
12874
|
}
|
12175
12875
|
}
|
12176
12876
|
|
@@ -12178,15 +12878,13 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
12178
12878
|
{
|
12179
12879
|
const auto & kv_self = ctx->kv_self;
|
12180
12880
|
const auto & hparams = ctx->model.hparams;
|
12181
|
-
const auto & cparams = ctx->cparams;
|
12182
12881
|
|
12183
|
-
const
|
12184
|
-
const
|
12185
|
-
const
|
12186
|
-
const auto n_ctx = cparams.n_ctx;
|
12882
|
+
const uint32_t n_layer = hparams.n_layer;
|
12883
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
12884
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
12187
12885
|
|
12188
12886
|
const size_t kv_buf_size = kv_self.total_size();
|
12189
|
-
const uint32_t kv_head = kv_self.head;
|
12887
|
+
const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
|
12190
12888
|
const uint32_t kv_size = kv_self.size;
|
12191
12889
|
const uint32_t kv_used = kv_self.used;
|
12192
12890
|
|
@@ -12198,14 +12896,16 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
12198
12896
|
if (kv_buf_size) {
|
12199
12897
|
std::vector<uint8_t> tmp_buf;
|
12200
12898
|
for (int il = 0; il < (int) n_layer; ++il) {
|
12201
|
-
size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
12899
|
+
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
12900
|
+
|
12202
12901
|
tmp_buf.resize(k_size);
|
12203
12902
|
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
|
12204
12903
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
12205
12904
|
|
12206
12905
|
// v is not contiguous, copy row by row
|
12207
|
-
size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
|
12208
|
-
size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
|
12906
|
+
const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
|
12907
|
+
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
|
12908
|
+
|
12209
12909
|
tmp_buf.resize(v_row_size);
|
12210
12910
|
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
12211
12911
|
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
|
@@ -12214,7 +12914,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
12214
12914
|
}
|
12215
12915
|
}
|
12216
12916
|
|
12217
|
-
for (uint32_t i = 0; i <
|
12917
|
+
for (uint32_t i = 0; i < kv_head; ++i) {
|
12218
12918
|
const auto & cell = kv_self.cells[i];
|
12219
12919
|
|
12220
12920
|
const llama_pos pos = cell.pos;
|
@@ -12238,8 +12938,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|
12238
12938
|
}
|
12239
12939
|
|
12240
12940
|
// Sets the state reading from the specified source address
|
12241
|
-
size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
12242
|
-
uint8_t * inp = src;
|
12941
|
+
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
12942
|
+
const uint8_t * inp = src;
|
12243
12943
|
|
12244
12944
|
// set rng
|
12245
12945
|
{
|
@@ -12248,7 +12948,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
12248
12948
|
|
12249
12949
|
GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
|
12250
12950
|
|
12251
|
-
std::string rng_str((char *)inp, rng_size); inp += rng_size;
|
12951
|
+
std::string rng_str((const char *)inp, rng_size); inp += rng_size;
|
12252
12952
|
|
12253
12953
|
std::istringstream rng_ss(rng_str);
|
12254
12954
|
rng_ss >> ctx->rng;
|
@@ -12274,15 +12974,17 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
12274
12974
|
|
12275
12975
|
// set embeddings
|
12276
12976
|
{
|
12277
|
-
size_t embedding_size;
|
12977
|
+
size_t embeddings_size;
|
12978
|
+
|
12979
|
+
memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
|
12278
12980
|
|
12279
|
-
memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
|
12981
|
+
GGML_ASSERT(ctx->embd.capacity() == embeddings_size);
|
12280
12982
|
|
12281
|
-
GGML_ASSERT(ctx->embedding.capacity() == embedding_size);
|
12983
|
+
if (embeddings_size) {
|
12984
|
+
ctx->embd.resize(embeddings_size);
|
12282
12985
|
|
12283
|
-
if (embedding_size) {
|
12284
|
-
memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
|
12285
|
-
inp += embedding_size * sizeof(float);
|
12986
|
+
memcpy(ctx->embd.data(), inp, embeddings_size * sizeof(float));
|
12987
|
+
inp += embeddings_size * sizeof(float);
|
12286
12988
|
}
|
12287
12989
|
}
|
12288
12990
|
|
@@ -12290,12 +12992,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
12290
12992
|
{
|
12291
12993
|
const auto & kv_self = ctx->kv_self;
|
12292
12994
|
const auto & hparams = ctx->model.hparams;
|
12293
|
-
const auto & cparams = ctx->cparams;
|
12294
12995
|
|
12295
|
-
const int n_layer = hparams.n_layer;
|
12296
|
-
const int n_embd_k_gqa = hparams.n_embd_k_gqa();
|
12297
|
-
const int n_embd_v_gqa = hparams.n_embd_v_gqa();
|
12298
|
-
const int n_ctx = cparams.n_ctx;
|
12996
|
+
const uint32_t n_layer = hparams.n_layer;
|
12997
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
12998
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
12299
12999
|
|
12300
13000
|
size_t kv_buf_size;
|
12301
13001
|
uint32_t kv_head;
|
@@ -12311,13 +13011,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
12311
13011
|
GGML_ASSERT(kv_self.total_size() == kv_buf_size);
|
12312
13012
|
|
12313
13013
|
for (int il = 0; il < (int) n_layer; ++il) {
|
12314
|
-
size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
13014
|
+
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
13015
|
+
|
12315
13016
|
ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
|
12316
13017
|
inp += k_size;
|
12317
13018
|
|
12318
13019
|
// v is not contiguous, copy row by row
|
12319
|
-
size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
|
12320
|
-
size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
|
13020
|
+
const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
|
13021
|
+
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
|
13022
|
+
|
12321
13023
|
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
12322
13024
|
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
|
12323
13025
|
inp += v_row_size;
|
@@ -12325,13 +13027,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
12325
13027
|
}
|
12326
13028
|
}
|
12327
13029
|
|
13030
|
+
GGML_ASSERT(kv_self.size == kv_size);
|
13031
|
+
|
12328
13032
|
ctx->kv_self.head = kv_head;
|
12329
13033
|
ctx->kv_self.size = kv_size;
|
12330
13034
|
ctx->kv_self.used = kv_used;
|
12331
13035
|
|
12332
13036
|
ctx->kv_self.cells.resize(kv_size);
|
12333
13037
|
|
12334
|
-
for (uint32_t i = 0; i <
|
13038
|
+
for (uint32_t i = 0; i < kv_head; ++i) {
|
12335
13039
|
llama_pos pos;
|
12336
13040
|
size_t seq_id_size;
|
12337
13041
|
|
@@ -12347,6 +13051,11 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
12347
13051
|
ctx->kv_self.cells[i].seq_id.insert(seq_id);
|
12348
13052
|
}
|
12349
13053
|
}
|
13054
|
+
|
13055
|
+
for (uint32_t i = kv_head; i < kv_size; ++i) {
|
13056
|
+
ctx->kv_self.cells[i].pos = -1;
|
13057
|
+
ctx->kv_self.cells[i].seq_id.clear();
|
13058
|
+
}
|
12350
13059
|
}
|
12351
13060
|
|
12352
13061
|
const size_t nread = inp - src;
|
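Together with the now const-correct llama_set_state_data() above, context state round-trips as below (a sketch; ctx and ctx2 are assumed to be contexts created with identical parameters):

#include <cstdint>
#include <vector>
#include "llama.h"

void clone_state(llama_context * ctx, llama_context * ctx2) {
    std::vector<uint8_t> state(llama_get_state_size(ctx));
    const size_t n_written = llama_copy_state_data(ctx, state.data());
    const size_t n_read    = llama_set_state_data(ctx2, state.data());
    (void) n_written; (void) n_read; // both are <= state.size()
}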
@@ -12439,43 +13148,16 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
|
|
12439
13148
|
return true;
|
12440
13149
|
}
|
12441
13150
|
|
12442
|
-
int llama_eval(
|
12443
|
-
struct llama_context * ctx,
|
12444
|
-
llama_token * tokens,
|
12445
|
-
int32_t n_tokens,
|
12446
|
-
int32_t n_past) {
|
12447
|
-
llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
|
12448
|
-
|
12449
|
-
const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
|
12450
|
-
if (ret < 0) {
|
12451
|
-
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
|
12452
|
-
}
|
12453
|
-
|
12454
|
-
return ret;
|
12455
|
-
}
|
12456
|
-
|
12457
|
-
int llama_eval_embd(
|
12458
|
-
struct llama_context * ctx,
|
12459
|
-
float * embd,
|
12460
|
-
int32_t n_tokens,
|
12461
|
-
int32_t n_past) {
|
12462
|
-
llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
|
12463
|
-
|
12464
|
-
llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
|
12465
|
-
|
12466
|
-
const int ret = llama_decode_internal(*ctx, batch);
|
12467
|
-
if (ret < 0) {
|
12468
|
-
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
|
12469
|
-
}
|
12470
|
-
|
12471
|
-
return ret;
|
12472
|
-
}
|
12473
|
-
|
12474
13151
|
void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
|
12475
13152
|
ctx->cparams.n_threads = n_threads;
|
12476
13153
|
ctx->cparams.n_threads_batch = n_threads_batch;
|
12477
13154
|
}
|
12478
13155
|
|
13156
|
+
void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
|
13157
|
+
ctx->abort_callback = abort_callback;
|
13158
|
+
ctx->abort_callback_data = abort_callback_data;
|
13159
|
+
}
|
13160
|
+
|
12479
13161
|
struct llama_batch llama_batch_get_one(
|
12480
13162
|
llama_token * tokens,
|
12481
13163
|
int32_t n_tokens,
|
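This hunk removes the last pre-batch-API entry points and wires in cooperative cancellation. A migration sketch for llama_eval(), plus a hypothetical cross-thread stop flag for the new abort callback (tokens, n_tokens and n_past are assumed to be the caller's existing variables):

#include <atomic>
#include "llama.h"

static std::atomic<bool> g_stop{false}; // hypothetical flag, set from another thread

int decode_once(llama_context * ctx, llama_token * tokens, int32_t n_tokens, int32_t n_past) {
    // what llama_eval(ctx, tokens, n_tokens, n_past) used to do:
    llama_kv_cache_seq_rm(ctx, -1, n_past, -1);
    return llama_decode(ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
}

void enable_cancellation(llama_context * ctx) {
    llama_set_abort_callback(ctx,
        [](void * data) { return ((std::atomic<bool> *) data)->load(); },
        &g_stop);
}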
@@ -12552,11 +13234,20 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
|
12552
13234
|
}
|
12553
13235
|
|
12554
13236
|
float * llama_get_embeddings(struct llama_context * ctx) {
|
12555
|
-
return ctx->embedding.data();
|
13237
|
+
return ctx->embd.data();
|
12556
13238
|
}
|
12557
13239
|
|
12558
13240
|
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
|
12559
|
-
return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
|
13241
|
+
return ctx->embd.data() + i*ctx->model.hparams.n_embd;
|
13242
|
+
}
|
13243
|
+
|
13244
|
+
float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
|
13245
|
+
auto it = ctx->embd_seq.find(seq_id);
|
13246
|
+
if (it == ctx->embd_seq.end()) {
|
13247
|
+
return nullptr;
|
13248
|
+
}
|
13249
|
+
|
13250
|
+
return it->second.data();
|
12560
13251
|
}
|
12561
13252
|
|
12562
13253
|
const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
|
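llama_get_embeddings_seq() above is new and returns nullptr when no pooled embedding exists for the given sequence, so the result must be checked. A sketch:

#include "llama.h"

void use_seq_embedding(llama_context * ctx) {
    const float * emb = llama_get_embeddings_seq(ctx, /*seq_id=*/0);
    if (emb != nullptr) {
        const int32_t n_embd = llama_n_embd(llama_get_model(ctx));
        (void) n_embd; // emb[0 .. n_embd-1] holds the pooled sequence embedding
    }
}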
@@ -12730,7 +13421,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
12730
13421
|
std::string & dest, bool add_ass) {
|
12731
13422
|
// Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
|
12732
13423
|
std::stringstream ss;
|
12733
|
-
if (tmpl.find("<|im_start|>") != std::string::npos) {
|
13424
|
+
if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) {
|
12734
13425
|
// chatml template
|
12735
13426
|
for (auto message : chat) {
|
12736
13427
|
ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
|
@@ -12738,7 +13429,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
12738
13429
|
if (add_ass) {
|
12739
13430
|
ss << "<|im_start|>assistant\n";
|
12740
13431
|
}
|
12741
|
-
} else if (tmpl.find("[INST]") != std::string::npos) {
|
13432
|
+
} else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) {
|
12742
13433
|
// llama2 template and its variants
|
12743
13434
|
// [variant] support system message
|
12744
13435
|
bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
|
@@ -12773,7 +13464,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
12773
13464
|
}
|
12774
13465
|
}
|
12775
13466
|
// llama2 templates seem to not care about "add_generation_prompt"
|
12776
|
-
} else if (tmpl.find("<|user|>") != std::string::npos) {
|
13467
|
+
} else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
|
12777
13468
|
// zephyr template
|
12778
13469
|
for (auto message : chat) {
|
12779
13470
|
ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
|
@@ -12781,7 +13472,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
12781
13472
|
if (add_ass) {
|
12782
13473
|
ss << "<|assistant|>\n";
|
12783
13474
|
}
|
12784
|
-
} else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
|
13475
|
+
} else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) {
|
12785
13476
|
// mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
|
12786
13477
|
for (auto message : chat) {
|
12787
13478
|
std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
|
@@ -12790,7 +13481,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
12790
13481
|
if (add_ass) {
|
12791
13482
|
ss << "<s>assistant\n";
|
12792
13483
|
}
|
12793
|
-
} else if (tmpl.find("<start_of_turn>") != std::string::npos) {
|
13484
|
+
} else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
|
12794
13485
|
// google/gemma-7b-it
|
12795
13486
|
std::string system_prompt = "";
|
12796
13487
|
for (auto message : chat) {
|
@@ -12837,23 +13528,27 @@ LLAMA_API int32_t llama_chat_apply_template(
|
|
12837
13528
|
int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
|
12838
13529
|
if (res < 0) {
|
12839
13530
|
// worst case: there is no information about template, we will use chatml by default
|
12840
|
-
curr_tmpl = "
|
13531
|
+
curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
|
12841
13532
|
} else {
|
12842
13533
|
curr_tmpl = std::string(model_template.data(), model_template.size());
|
12843
13534
|
}
|
12844
13535
|
}
|
13536
|
+
|
12845
13537
|
// format the chat to string
|
12846
13538
|
std::vector<const llama_chat_message *> chat_vec;
|
12847
13539
|
chat_vec.resize(n_msg);
|
12848
13540
|
for (size_t i = 0; i < n_msg; i++) {
|
12849
13541
|
chat_vec[i] = &chat[i];
|
12850
13542
|
}
|
13543
|
+
|
12851
13544
|
std::string formatted_chat;
|
12852
13545
|
int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
|
12853
13546
|
if (res < 0) {
|
12854
13547
|
return res;
|
12855
13548
|
}
|
12856
|
-
strncpy(buf, formatted_chat.c_str(), length);
|
13549
|
+
if (buf && length > 0) {
|
13550
|
+
strncpy(buf, formatted_chat.c_str(), length);
|
13551
|
+
}
|
12857
13552
|
return res;
|
12858
13553
|
}
|
12859
13554
|
|
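With this change the template matcher accepts the short names "chatml", "llama2", "zephyr", "monarch" and "gemma" in addition to sniffing the Jinja template text, and the final strncpy is guarded against a null or empty output buffer. A usage sketch (model may be nullptr when a named template is passed explicitly):

#include <cstdint>
#include <vector>
#include "llama.h"

std::vector<char> format_chatml() {
    llama_chat_message chat[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!"                       },
    };
    std::vector<char> buf(1024);
    const int32_t n = llama_chat_apply_template(
        /*model=*/nullptr, "chatml", chat, 2, /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (n > (int32_t) buf.size()) {
        buf.resize(n); // the return value is the required length; re-apply with a larger buffer
        llama_chat_apply_template(nullptr, "chatml", chat, 2, true, buf.data(), (int32_t) buf.size());
    }
    return buf;
}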