llama_cpp 0.13.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +59 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -4
- data/vendor/tmp/llama.cpp/Makefile +2 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +4 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +18 -21
- data/vendor/tmp/llama.cpp/ggml-backend.h +16 -15
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +949 -168
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +63 -7
- data/vendor/tmp/llama.cpp/ggml-metal.metal +120 -75
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +178 -133
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3432 -1118
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1327 -773
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +227 -15
- data/vendor/tmp/llama.cpp/ggml.h +30 -4
- data/vendor/tmp/llama.cpp/llama.cpp +631 -211
- data/vendor/tmp/llama.cpp/llama.h +28 -10
- metadata +2 -2
|
@@ -104,6 +104,7 @@
|
|
|
104
104
|
#define LLAMA_MAX_NODES 8192
|
|
105
105
|
#define LLAMA_MAX_EXPERTS 8
|
|
106
106
|
|
|
107
|
+
|
|
107
108
|
//
|
|
108
109
|
// logging
|
|
109
110
|
//
|
|
@@ -211,10 +212,11 @@ enum llm_arch {
|
|
|
211
212
|
LLM_ARCH_INTERNLM2,
|
|
212
213
|
LLM_ARCH_MINICPM,
|
|
213
214
|
LLM_ARCH_GEMMA,
|
|
215
|
+
LLM_ARCH_STARCODER2,
|
|
214
216
|
LLM_ARCH_UNKNOWN,
|
|
215
217
|
};
|
|
216
218
|
|
|
217
|
-
static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
219
|
+
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
218
220
|
{ LLM_ARCH_LLAMA, "llama" },
|
|
219
221
|
{ LLM_ARCH_FALCON, "falcon" },
|
|
220
222
|
{ LLM_ARCH_GPT2, "gpt2" },
|
|
@@ -238,6 +240,8 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
|
238
240
|
{ LLM_ARCH_INTERNLM2, "internlm2" },
|
|
239
241
|
{ LLM_ARCH_MINICPM, "minicpm" },
|
|
240
242
|
{ LLM_ARCH_GEMMA, "gemma" },
|
|
243
|
+
{ LLM_ARCH_STARCODER2, "starcoder2" },
|
|
244
|
+
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
|
241
245
|
};
|
|
242
246
|
|
|
243
247
|
enum llm_kv {
|
|
@@ -298,7 +302,7 @@ enum llm_kv {
|
|
|
298
302
|
LLM_KV_TOKENIZER_RWKV,
|
|
299
303
|
};
|
|
300
304
|
|
|
301
|
-
static std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
305
|
+
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
302
306
|
{ LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
|
|
303
307
|
{ LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
|
|
304
308
|
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
|
|
@@ -362,7 +366,7 @@ struct LLM_KV {
|
|
|
362
366
|
llm_arch arch;
|
|
363
367
|
|
|
364
368
|
std::string operator()(llm_kv kv) const {
|
|
365
|
-
return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
|
|
369
|
+
return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
|
|
366
370
|
}
|
|
367
371
|
};
|
|
368
372
|
|
|
@@ -397,7 +401,7 @@ enum llm_tensor {
|
|
|
397
401
|
LLM_TENSOR_LAYER_OUT_NORM,
|
|
398
402
|
};
|
|
399
403
|
|
|
400
|
-
static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
|
404
|
+
static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
|
401
405
|
{
|
|
402
406
|
LLM_ARCH_LLAMA,
|
|
403
407
|
{
|
|
@@ -779,6 +783,24 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|
|
779
783
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
780
784
|
},
|
|
781
785
|
},
|
|
786
|
+
{
|
|
787
|
+
LLM_ARCH_STARCODER2,
|
|
788
|
+
{
|
|
789
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|
790
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|
791
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
|
792
|
+
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
|
793
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
|
794
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
|
795
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|
796
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
|
797
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|
798
|
+
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
|
799
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
|
800
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
|
801
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
802
|
+
},
|
|
803
|
+
},
|
|
782
804
|
{
|
|
783
805
|
LLM_ARCH_UNKNOWN,
|
|
784
806
|
{
|
|
@@ -812,38 +834,38 @@ struct LLM_TN {
|
|
|
812
834
|
llm_arch arch;
|
|
813
835
|
|
|
814
836
|
std::string operator()(llm_tensor tensor) const {
|
|
815
|
-
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
|
|
837
|
+
if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
|
|
816
838
|
return "__missing__";
|
|
817
839
|
}
|
|
818
|
-
return LLM_TENSOR_NAMES[arch].at(tensor);
|
|
840
|
+
return LLM_TENSOR_NAMES.at(arch).at(tensor);
|
|
819
841
|
}
|
|
820
842
|
|
|
821
843
|
std::string operator()(llm_tensor tensor, const std::string & suffix) const {
|
|
822
|
-
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
|
|
844
|
+
if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
|
|
823
845
|
return "__missing__";
|
|
824
846
|
}
|
|
825
|
-
return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
|
|
847
|
+
return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix;
|
|
826
848
|
}
|
|
827
849
|
|
|
828
850
|
std::string operator()(llm_tensor tensor, int bid) const {
|
|
829
|
-
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
|
|
851
|
+
if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
|
|
830
852
|
return "__missing__";
|
|
831
853
|
}
|
|
832
|
-
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
|
|
854
|
+
return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid);
|
|
833
855
|
}
|
|
834
856
|
|
|
835
857
|
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
|
|
836
|
-
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
|
|
858
|
+
if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
|
|
837
859
|
return "__missing__";
|
|
838
860
|
}
|
|
839
|
-
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
|
|
861
|
+
return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix;
|
|
840
862
|
}
|
|
841
863
|
|
|
842
864
|
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
|
|
843
|
-
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
|
|
865
|
+
if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
|
|
844
866
|
return "__missing__";
|
|
845
867
|
}
|
|
846
|
-
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
|
|
868
|
+
return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
|
|
847
869
|
}
|
|
848
870
|
};
|
|
849
871
|
|
|
@@ -851,16 +873,16 @@ struct LLM_TN {
|
|
|
851
873
|
// gguf helpers
|
|
852
874
|
//
|
|
853
875
|
|
|
854
|
-
static std::map<
|
|
876
|
+
static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
|
|
855
877
|
{ LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
|
|
856
878
|
{ LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
|
|
857
879
|
{ LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
|
|
858
880
|
};
|
|
859
881
|
|
|
860
|
-
static
|
|
882
|
+
static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
|
|
861
883
|
for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
|
|
862
884
|
if (kv.second == name) {
|
|
863
|
-
return kv.first;
|
|
885
|
+
return (llama_rope_scaling_type) kv.first;
|
|
864
886
|
}
|
|
865
887
|
}
|
|
866
888
|
|
|
@@ -1409,7 +1431,9 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
|
|
|
1409
1431
|
buft = ggml_backend_cuda_host_buffer_type();
|
|
1410
1432
|
}
|
|
1411
1433
|
#elif defined(GGML_USE_SYCL)
|
|
1412
|
-
|
|
1434
|
+
if (host_buffer) {
|
|
1435
|
+
buft = ggml_backend_sycl_host_buffer_type();
|
|
1436
|
+
}
|
|
1413
1437
|
#elif defined(GGML_USE_CPU_HBM)
|
|
1414
1438
|
buft = ggml_backend_cpu_hbm_buffer_type();
|
|
1415
1439
|
#elif defined(GGML_USE_VULKAN)
|
|
@@ -1463,6 +1487,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
|
|
|
1463
1487
|
}
|
|
1464
1488
|
#endif
|
|
1465
1489
|
|
|
1490
|
+
#ifdef GGML_USE_SYCL
|
|
1491
|
+
if (ggml_backend_sycl_get_device_count() > 1) {
|
|
1492
|
+
buft = ggml_backend_sycl_split_buffer_type(tensor_split);
|
|
1493
|
+
}
|
|
1494
|
+
#endif
|
|
1495
|
+
|
|
1466
1496
|
if (buft == nullptr) {
|
|
1467
1497
|
buft = llama_default_buffer_type_offload(fallback_gpu);
|
|
1468
1498
|
}
|
|
@@ -1474,6 +1504,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
|
|
|
1474
1504
|
static size_t llama_get_device_count() {
|
|
1475
1505
|
#if defined(GGML_USE_CUBLAS)
|
|
1476
1506
|
return ggml_backend_cuda_get_device_count();
|
|
1507
|
+
#elif defined(GGML_USE_SYCL)
|
|
1508
|
+
return ggml_backend_sycl_get_device_count();
|
|
1477
1509
|
#elif defined(GGML_USE_VULKAN)
|
|
1478
1510
|
return ggml_backend_vk_get_device_count();
|
|
1479
1511
|
#else
|
|
@@ -1487,6 +1519,11 @@ static size_t llama_get_device_memory(int device) {
|
|
|
1487
1519
|
size_t free;
|
|
1488
1520
|
ggml_backend_cuda_get_device_memory(device, &total, &free);
|
|
1489
1521
|
return free;
|
|
1522
|
+
#elif defined(GGML_USE_SYCL)
|
|
1523
|
+
size_t total;
|
|
1524
|
+
size_t free;
|
|
1525
|
+
ggml_backend_sycl_get_device_memory(device, &total, &free);
|
|
1526
|
+
return free;
|
|
1490
1527
|
#elif defined(GGML_USE_VULKAN)
|
|
1491
1528
|
size_t total;
|
|
1492
1529
|
size_t free;
|
|
@@ -1575,7 +1612,6 @@ struct llama_hparams {
|
|
|
1575
1612
|
float rope_freq_base_train;
|
|
1576
1613
|
float rope_freq_scale_train;
|
|
1577
1614
|
uint32_t n_yarn_orig_ctx;
|
|
1578
|
-
int32_t rope_scaling_type_train;
|
|
1579
1615
|
|
|
1580
1616
|
float f_clamp_kqv = 0.0f;
|
|
1581
1617
|
float f_max_alibi_bias = 0.0f;
|
|
@@ -1583,8 +1619,9 @@ struct llama_hparams {
|
|
|
1583
1619
|
bool causal_attn = true;
|
|
1584
1620
|
bool need_kq_pos = false;
|
|
1585
1621
|
|
|
1586
|
-
enum llama_pooling_type
|
|
1587
|
-
enum llama_rope_type
|
|
1622
|
+
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
|
|
1623
|
+
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
|
|
1624
|
+
enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
|
|
1588
1625
|
|
|
1589
1626
|
bool operator!=(const llama_hparams & other) const {
|
|
1590
1627
|
if (this->vocab_only != other.vocab_only) return true;
|
|
@@ -1628,13 +1665,13 @@ struct llama_hparams {
|
|
|
1628
1665
|
};
|
|
1629
1666
|
|
|
1630
1667
|
struct llama_cparams {
|
|
1631
|
-
uint32_t n_ctx;
|
|
1668
|
+
uint32_t n_ctx; // context size used during inference
|
|
1632
1669
|
uint32_t n_batch;
|
|
1633
1670
|
uint32_t n_threads; // number of threads to use for generation
|
|
1634
1671
|
uint32_t n_threads_batch; // number of threads to use for batch processing
|
|
1635
1672
|
|
|
1636
|
-
float
|
|
1637
|
-
float
|
|
1673
|
+
float rope_freq_base;
|
|
1674
|
+
float rope_freq_scale;
|
|
1638
1675
|
|
|
1639
1676
|
uint32_t n_yarn_orig_ctx;
|
|
1640
1677
|
// These hyperparameters are not exposed in GGUF, because all
|
|
@@ -1645,8 +1682,10 @@ struct llama_cparams {
|
|
|
1645
1682
|
float yarn_beta_slow;
|
|
1646
1683
|
float defrag_thold;
|
|
1647
1684
|
|
|
1685
|
+
bool embeddings;
|
|
1648
1686
|
bool offload_kqv;
|
|
1649
|
-
|
|
1687
|
+
|
|
1688
|
+
enum llama_pooling_type pooling_type;
|
|
1650
1689
|
|
|
1651
1690
|
ggml_backend_sched_eval_callback cb_eval;
|
|
1652
1691
|
void * cb_eval_user_data;
|
|
@@ -1935,7 +1974,7 @@ struct llama_context {
|
|
|
1935
1974
|
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
|
1936
1975
|
int32_t n_eval = 0; // number of eval calls
|
|
1937
1976
|
|
|
1938
|
-
//
|
|
1977
|
+
// logits output (2-dimensional array: [n_tokens][n_vocab])
|
|
1939
1978
|
std::vector<float> logits;
|
|
1940
1979
|
#ifndef NDEBUG
|
|
1941
1980
|
// guard against access to unset logits
|
|
@@ -1943,13 +1982,21 @@ struct llama_context {
|
|
|
1943
1982
|
#endif
|
|
1944
1983
|
bool logits_all = false;
|
|
1945
1984
|
|
|
1946
|
-
//
|
|
1947
|
-
|
|
1985
|
+
// embeddings output (2-dimensional array: [n_tokens][n_embd])
|
|
1986
|
+
// populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
|
|
1987
|
+
std::vector<float> embd;
|
|
1988
|
+
|
|
1989
|
+
// sequence embeddings output (map of [n_embd] vectors)
|
|
1990
|
+
// populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
|
|
1991
|
+
std::map<llama_seq_id, std::vector<float>> embd_seq;
|
|
1948
1992
|
|
|
1949
1993
|
// memory buffers used to evaluate the model
|
|
1950
1994
|
std::vector<uint8_t> buf_compute_meta;
|
|
1951
1995
|
ggml_backend_sched_t sched = nullptr;
|
|
1952
1996
|
|
|
1997
|
+
ggml_abort_callback abort_callback = nullptr;
|
|
1998
|
+
void * abort_callback_data = nullptr;
|
|
1999
|
+
|
|
1953
2000
|
// input tensors
|
|
1954
2001
|
ggml_backend_buffer_t buf_input = nullptr;
|
|
1955
2002
|
ggml_context * ctx_input = nullptr;
|
|
@@ -2116,10 +2163,12 @@ static bool llama_kv_cache_find_slot(
|
|
|
2116
2163
|
}
|
|
2117
2164
|
|
|
2118
2165
|
// find how many cells are currently in use
|
|
2119
|
-
static
|
|
2120
|
-
for (uint32_t i = cache.size
|
|
2121
|
-
|
|
2122
|
-
|
|
2166
|
+
static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
|
|
2167
|
+
for (uint32_t i = cache.size; i > 0; --i) {
|
|
2168
|
+
const llama_kv_cell & cell = cache.cells[i - 1];
|
|
2169
|
+
|
|
2170
|
+
if (cell.pos >= 0 && !cell.is_empty()) {
|
|
2171
|
+
return i;
|
|
2123
2172
|
}
|
|
2124
2173
|
}
|
|
2125
2174
|
|
|
@@ -2891,7 +2940,11 @@ template<>
|
|
|
2891
2940
|
bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
|
|
2892
2941
|
uint32_t tmp;
|
|
2893
2942
|
const bool found = get_key(kid, tmp, required);
|
|
2894
|
-
|
|
2943
|
+
if (found) {
|
|
2944
|
+
result = (enum llama_pooling_type) tmp;
|
|
2945
|
+
} else {
|
|
2946
|
+
result = LLAMA_POOLING_TYPE_UNSPECIFIED;
|
|
2947
|
+
}
|
|
2895
2948
|
return found;
|
|
2896
2949
|
}
|
|
2897
2950
|
|
|
@@ -3168,7 +3221,7 @@ static void llm_load_hparams(
|
|
|
3168
3221
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
3169
3222
|
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
3170
3223
|
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
|
3171
|
-
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
|
3224
|
+
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
|
3172
3225
|
|
|
3173
3226
|
switch (hparams.n_layer) {
|
|
3174
3227
|
case 3:
|
|
@@ -3320,6 +3373,16 @@ static void llm_load_hparams(
|
|
|
3320
3373
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
3321
3374
|
}
|
|
3322
3375
|
} break;
|
|
3376
|
+
case LLM_ARCH_STARCODER2:
|
|
3377
|
+
{
|
|
3378
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
3379
|
+
switch (hparams.n_layer) {
|
|
3380
|
+
case 30: model.type = e_model::MODEL_3B; break;
|
|
3381
|
+
case 32: model.type = e_model::MODEL_7B; break;
|
|
3382
|
+
case 40: model.type = e_model::MODEL_15B; break;
|
|
3383
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
|
3384
|
+
}
|
|
3385
|
+
} break;
|
|
3323
3386
|
default: (void)0;
|
|
3324
3387
|
}
|
|
3325
3388
|
|
|
@@ -4490,6 +4553,56 @@ static bool llm_load_tensors(
|
|
|
4490
4553
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
|
4491
4554
|
}
|
|
4492
4555
|
} break;
|
|
4556
|
+
case LLM_ARCH_STARCODER2:
|
|
4557
|
+
{
|
|
4558
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
4559
|
+
|
|
4560
|
+
// output
|
|
4561
|
+
{
|
|
4562
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
4563
|
+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
|
4564
|
+
|
|
4565
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
|
4566
|
+
// if output is NULL, init from the input tok embed
|
|
4567
|
+
if (model.output == NULL) {
|
|
4568
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
4569
|
+
ml.n_created--; // artificial tensor
|
|
4570
|
+
ml.size_data += ggml_nbytes(model.output);
|
|
4571
|
+
}
|
|
4572
|
+
|
|
4573
|
+
}
|
|
4574
|
+
|
|
4575
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
4576
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
|
4577
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
|
4578
|
+
|
|
4579
|
+
auto & layer = model.layers[i];
|
|
4580
|
+
|
|
4581
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
|
4582
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
|
4583
|
+
|
|
4584
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
|
4585
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
|
4586
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
|
4587
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
|
4588
|
+
|
|
4589
|
+
// optional bias tensors
|
|
4590
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
|
4591
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
|
4592
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
|
4593
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
|
4594
|
+
|
|
4595
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
|
4596
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
|
4597
|
+
|
|
4598
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
|
4599
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
|
4600
|
+
|
|
4601
|
+
// optional bias tensors
|
|
4602
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
|
4603
|
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff});
|
|
4604
|
+
}
|
|
4605
|
+
} break;
|
|
4493
4606
|
default:
|
|
4494
4607
|
throw std::runtime_error("unknown architecture");
|
|
4495
4608
|
}
|
|
@@ -4901,8 +5014,8 @@ static struct ggml_tensor * llm_build_kqv(
|
|
|
4901
5014
|
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
|
4902
5015
|
}
|
|
4903
5016
|
|
|
4904
|
-
#if defined(
|
|
4905
|
-
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for
|
|
5017
|
+
#if defined(GGML_USE_KOMPUTE)
|
|
5018
|
+
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
|
|
4906
5019
|
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
|
|
4907
5020
|
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
|
|
4908
5021
|
if (hparams.f_max_alibi_bias > 0.0f) {
|
|
@@ -4986,6 +5099,7 @@ static struct ggml_tensor * llm_build_kv(
|
|
|
4986
5099
|
llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
|
|
4987
5100
|
|
|
4988
5101
|
struct ggml_tensor * cur;
|
|
5102
|
+
|
|
4989
5103
|
cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
|
|
4990
5104
|
q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
|
|
4991
5105
|
cb(cur, "kqv_out", il);
|
|
@@ -5073,7 +5187,7 @@ struct llm_build_context {
|
|
|
5073
5187
|
n_kv (worst_case ? n_ctx : kv_self.n),
|
|
5074
5188
|
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
|
5075
5189
|
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
|
5076
|
-
pooling_type (cparams.
|
|
5190
|
+
pooling_type (cparams.pooling_type),
|
|
5077
5191
|
rope_type (hparams.rope_type),
|
|
5078
5192
|
cb (cb),
|
|
5079
5193
|
buf_compute_meta (lctx.buf_compute_meta) {
|
|
@@ -5979,6 +6093,7 @@ struct llm_build_context {
|
|
|
5979
6093
|
|
|
5980
6094
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5981
6095
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
6096
|
+
|
|
5982
6097
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
5983
6098
|
|
|
5984
6099
|
struct ggml_tensor * cur;
|
|
@@ -5986,9 +6101,10 @@ struct llm_build_context {
|
|
|
5986
6101
|
|
|
5987
6102
|
// get input vectors with right size
|
|
5988
6103
|
const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
|
|
5989
|
-
|
|
6104
|
+
|
|
6105
|
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
|
5990
6106
|
struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
|
|
5991
|
-
struct ggml_tensor * inp_cls
|
|
6107
|
+
struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
|
|
5992
6108
|
|
|
5993
6109
|
// construct input embeddings (token, type, position)
|
|
5994
6110
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
|
@@ -6006,39 +6122,38 @@ struct llm_build_context {
|
|
|
6006
6122
|
cb(inpL, "inp_norm", -1);
|
|
6007
6123
|
|
|
6008
6124
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
|
6009
|
-
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask,
|
|
6010
|
-
cb(KQ_mask, "KQ_mask", -1); // [
|
|
6125
|
+
struct ggml_tensor * KQ_mask = ggml_cont(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_tokens, n_tokens, n_tokens*ggml_type_size(lctx.inp_KQ_mask->type), 0));
|
|
6126
|
+
cb(KQ_mask, "KQ_mask", -1); // [n_tokens, n_tokens]
|
|
6011
6127
|
|
|
6012
6128
|
// iterate layers
|
|
6013
6129
|
for (int il = 0; il < n_layer; ++il) {
|
|
6014
6130
|
struct ggml_tensor * cur = inpL;
|
|
6015
6131
|
|
|
6132
|
+
struct ggml_tensor * Qcur;
|
|
6133
|
+
struct ggml_tensor * Kcur;
|
|
6134
|
+
struct ggml_tensor * Vcur;
|
|
6135
|
+
|
|
6016
6136
|
// self-attention
|
|
6017
6137
|
if (model.arch == LLM_ARCH_BERT) {
|
|
6018
|
-
|
|
6138
|
+
Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
|
|
6019
6139
|
cb(Qcur, "Qcur", il);
|
|
6020
6140
|
|
|
6021
|
-
|
|
6141
|
+
Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
|
|
6022
6142
|
cb(Kcur, "Kcur", il);
|
|
6023
6143
|
|
|
6024
|
-
|
|
6144
|
+
Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
|
|
6025
6145
|
cb(Vcur, "Vcur", il);
|
|
6026
6146
|
|
|
6027
|
-
|
|
6028
|
-
|
|
6029
|
-
|
|
6030
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
6031
|
-
model.layers[il].wo, model.layers[il].bo,
|
|
6032
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
6033
|
-
cb(cur, "kqv_out", il);
|
|
6147
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
6148
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
6034
6149
|
} else {
|
|
6035
6150
|
// compute Q and K and RoPE them
|
|
6036
6151
|
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
|
6037
6152
|
cb(cur, "wqkv", il);
|
|
6038
6153
|
|
|
6039
|
-
|
|
6040
|
-
|
|
6041
|
-
|
|
6154
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
|
6155
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
|
6156
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
6042
6157
|
|
|
6043
6158
|
cb(Qcur, "Qcur", il);
|
|
6044
6159
|
cb(Kcur, "Kcur", il);
|
|
@@ -6057,13 +6172,41 @@ struct llm_build_context {
|
|
|
6057
6172
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6058
6173
|
);
|
|
6059
6174
|
cb(Kcur, "Kcur", il);
|
|
6175
|
+
}
|
|
6060
6176
|
|
|
6061
|
-
|
|
6062
|
-
|
|
6063
|
-
|
|
6064
|
-
|
|
6177
|
+
struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
|
6178
|
+
struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
|
|
6179
|
+
|
|
6180
|
+
struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
|
|
6181
|
+
cb(kq, "kq", il);
|
|
6182
|
+
|
|
6183
|
+
kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
|
|
6184
|
+
cb(kq, "kq_soft_max_ext", il);
|
|
6185
|
+
|
|
6186
|
+
struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
|
|
6187
|
+
cb(v, "v", il);
|
|
6188
|
+
|
|
6189
|
+
struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
|
|
6190
|
+
cb(kqv, "kqv", il);
|
|
6191
|
+
|
|
6192
|
+
struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
|
|
6193
|
+
cb(kqv_merged, "kqv_merged", il);
|
|
6194
|
+
|
|
6195
|
+
cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
|
|
6196
|
+
cb(cur, "kqv_merged_cont", il);
|
|
6197
|
+
|
|
6198
|
+
ggml_build_forward_expand(gf, cur);
|
|
6199
|
+
|
|
6200
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
|
|
6201
|
+
if (model.layers[il].bo) {
|
|
6202
|
+
cb(cur, "kqv_wo", il);
|
|
6065
6203
|
}
|
|
6066
6204
|
|
|
6205
|
+
if (model.layers[il].bo) {
|
|
6206
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bo);
|
|
6207
|
+
}
|
|
6208
|
+
cb(cur, "kqv_out", il);
|
|
6209
|
+
|
|
6067
6210
|
// re-add the layer input
|
|
6068
6211
|
cur = ggml_add(ctx0, cur, inpL);
|
|
6069
6212
|
|
|
@@ -6103,16 +6246,29 @@ struct llm_build_context {
|
|
|
6103
6246
|
|
|
6104
6247
|
// final output
|
|
6105
6248
|
cur = inpL;
|
|
6249
|
+
cb(cur, "result_embd", -1);
|
|
6106
6250
|
|
|
6107
6251
|
// pooling layer
|
|
6108
|
-
|
|
6109
|
-
|
|
6110
|
-
|
|
6111
|
-
|
|
6112
|
-
|
|
6113
|
-
|
|
6252
|
+
switch (pooling_type) {
|
|
6253
|
+
case LLAMA_POOLING_TYPE_NONE:
|
|
6254
|
+
{
|
|
6255
|
+
// nop
|
|
6256
|
+
} break;
|
|
6257
|
+
case LLAMA_POOLING_TYPE_MEAN:
|
|
6258
|
+
{
|
|
6259
|
+
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
|
|
6260
|
+
cb(cur, "result_embd_pooled", -1);
|
|
6261
|
+
} break;
|
|
6262
|
+
case LLAMA_POOLING_TYPE_CLS:
|
|
6263
|
+
{
|
|
6264
|
+
cur = ggml_get_rows(ctx0, cur, inp_cls);
|
|
6265
|
+
cb(cur, "result_embd_pooled", -1);
|
|
6266
|
+
} break;
|
|
6267
|
+
case LLAMA_POOLING_TYPE_UNSPECIFIED:
|
|
6268
|
+
{
|
|
6269
|
+
GGML_ASSERT(false && "Invalid pooling type");
|
|
6270
|
+
} break;
|
|
6114
6271
|
}
|
|
6115
|
-
cb(cur, "result_embd", -1);
|
|
6116
6272
|
|
|
6117
6273
|
ggml_build_forward_expand(gf, cur);
|
|
6118
6274
|
|
|
@@ -7559,6 +7715,120 @@ struct llm_build_context {
|
|
|
7559
7715
|
|
|
7560
7716
|
return gf;
|
|
7561
7717
|
}
|
|
7718
|
+
|
|
7719
|
+
struct ggml_cgraph * build_starcoder2() {
|
|
7720
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
|
7721
|
+
|
|
7722
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
7723
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
7724
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
7725
|
+
|
|
7726
|
+
struct ggml_tensor * cur;
|
|
7727
|
+
struct ggml_tensor * inpL;
|
|
7728
|
+
|
|
7729
|
+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
|
7730
|
+
cb(inpL, "inp_embd", -1);
|
|
7731
|
+
|
|
7732
|
+
// inp_pos - contains the positions
|
|
7733
|
+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
|
7734
|
+
cb(inp_pos, "inp_pos", -1);
|
|
7735
|
+
|
|
7736
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
|
7737
|
+
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
|
7738
|
+
cb(KQ_mask, "KQ_mask", -1);
|
|
7739
|
+
|
|
7740
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
7741
|
+
struct ggml_tensor * inpSA = inpL;
|
|
7742
|
+
|
|
7743
|
+
// norm
|
|
7744
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
|
7745
|
+
model.layers[il].attn_norm, model.layers[il].attn_norm_b,
|
|
7746
|
+
LLM_NORM, cb, il);
|
|
7747
|
+
cb(cur, "attn_norm", il);
|
|
7748
|
+
|
|
7749
|
+
// self-attention
|
|
7750
|
+
{
|
|
7751
|
+
// compute Q and K and RoPE them
|
|
7752
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
|
7753
|
+
cb(Qcur, "Qcur", il);
|
|
7754
|
+
if (model.layers[il].bq) {
|
|
7755
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
7756
|
+
cb(Qcur, "Qcur", il);
|
|
7757
|
+
}
|
|
7758
|
+
|
|
7759
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
|
7760
|
+
cb(Kcur, "Kcur", il);
|
|
7761
|
+
if (model.layers[il].bk) {
|
|
7762
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
7763
|
+
cb(Kcur, "Kcur", il);
|
|
7764
|
+
}
|
|
7765
|
+
|
|
7766
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
|
7767
|
+
cb(Vcur, "Vcur", il);
|
|
7768
|
+
if (model.layers[il].bv) {
|
|
7769
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
7770
|
+
cb(Vcur, "Vcur", il);
|
|
7771
|
+
}
|
|
7772
|
+
|
|
7773
|
+
Qcur = ggml_rope_custom(
|
|
7774
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
7775
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
7776
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7777
|
+
);
|
|
7778
|
+
cb(Qcur, "Qcur", il);
|
|
7779
|
+
|
|
7780
|
+
Kcur = ggml_rope_custom(
|
|
7781
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
7782
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
7783
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7784
|
+
);
|
|
7785
|
+
cb(Kcur, "Kcur", il);
|
|
7786
|
+
|
|
7787
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
|
7788
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
7789
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
7790
|
+
cb(cur, "kqv_out", il);
|
|
7791
|
+
}
|
|
7792
|
+
|
|
7793
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
7794
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
7795
|
+
|
|
7796
|
+
// feed-forward network
|
|
7797
|
+
|
|
7798
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
|
7799
|
+
model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
|
|
7800
|
+
LLM_NORM, cb, il);
|
|
7801
|
+
cb(cur, "ffn_norm", il);
|
|
7802
|
+
|
|
7803
|
+
cur = llm_build_ffn(ctx0, cur,
|
|
7804
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
|
7805
|
+
NULL, NULL,
|
|
7806
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
|
7807
|
+
NULL,
|
|
7808
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
|
7809
|
+
cb(cur, "ffn_out", il);
|
|
7810
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
7811
|
+
cb(cur, "l_out", il);
|
|
7812
|
+
|
|
7813
|
+
// input for next layer
|
|
7814
|
+
inpL = cur;
|
|
7815
|
+
}
|
|
7816
|
+
|
|
7817
|
+
cur = inpL;
|
|
7818
|
+
|
|
7819
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
|
7820
|
+
model.output_norm, model.output_norm_b,
|
|
7821
|
+
LLM_NORM, cb, -1);
|
|
7822
|
+
cb(cur, "result_norm", -1);
|
|
7823
|
+
|
|
7824
|
+
// lm_head
|
|
7825
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
|
7826
|
+
cb(cur, "result_output", -1);
|
|
7827
|
+
|
|
7828
|
+
ggml_build_forward_expand(gf, cur);
|
|
7829
|
+
|
|
7830
|
+
return gf;
|
|
7831
|
+
}
|
|
7562
7832
|
};
|
|
7563
7833
|
|
|
7564
7834
|
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
|
@@ -7705,6 +7975,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
|
7705
7975
|
{
|
|
7706
7976
|
result = llm.build_gemma();
|
|
7707
7977
|
} break;
|
|
7978
|
+
case LLM_ARCH_STARCODER2:
|
|
7979
|
+
{
|
|
7980
|
+
result = llm.build_starcoder2();
|
|
7981
|
+
} break;
|
|
7708
7982
|
default:
|
|
7709
7983
|
GGML_ASSERT(false);
|
|
7710
7984
|
}
|
|
@@ -7756,7 +8030,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
|
7756
8030
|
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
|
|
7757
8031
|
}
|
|
7758
8032
|
|
|
7759
|
-
{
|
|
8033
|
+
if (hparams.causal_attn) {
|
|
7760
8034
|
const int64_t n_kv = kv_self.n;
|
|
7761
8035
|
const int64_t n_tokens = batch.n_tokens;
|
|
7762
8036
|
|
|
@@ -7771,16 +8045,40 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
|
7771
8045
|
|
|
7772
8046
|
for (int i = 0; i < n_kv; ++i) {
|
|
7773
8047
|
float f;
|
|
7774
|
-
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
|
|
7775
|
-
(hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
|
|
8048
|
+
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
|
|
7776
8049
|
f = -INFINITY;
|
|
7777
8050
|
} else {
|
|
7778
|
-
f = 0;
|
|
8051
|
+
f = 0.0f;
|
|
7779
8052
|
}
|
|
7780
8053
|
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
|
7781
8054
|
}
|
|
7782
8055
|
}
|
|
7783
8056
|
}
|
|
8057
|
+
} else {
|
|
8058
|
+
// non-causal attention attends only the tokens within the batch (i.e. the KV cache is not used)
|
|
8059
|
+
const int64_t n_tokens = batch.n_tokens;
|
|
8060
|
+
|
|
8061
|
+
assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
|
|
8062
|
+
|
|
8063
|
+
float * data = (float *) lctx.inp_KQ_mask->data;
|
|
8064
|
+
|
|
8065
|
+
for (int h = 0; h < 1; ++h) {
|
|
8066
|
+
for (int j = 0; j < n_tokens; ++j) {
|
|
8067
|
+
const llama_seq_id seq_id = batch.seq_id[j][0];
|
|
8068
|
+
|
|
8069
|
+
for (int i = 0; i < n_tokens; ++i) {
|
|
8070
|
+
float f = -INFINITY;
|
|
8071
|
+
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
|
|
8072
|
+
if (batch.seq_id[i][s] == seq_id) {
|
|
8073
|
+
f = 0.0f;
|
|
8074
|
+
break;
|
|
8075
|
+
}
|
|
8076
|
+
}
|
|
8077
|
+
|
|
8078
|
+
data[h*(n_tokens*n_tokens) + j*n_tokens + i] = f;
|
|
8079
|
+
}
|
|
8080
|
+
}
|
|
8081
|
+
}
|
|
7784
8082
|
}
|
|
7785
8083
|
|
|
7786
8084
|
if (hparams.need_kq_pos) {
|
|
@@ -7795,17 +8093,20 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
|
7795
8093
|
}
|
|
7796
8094
|
}
|
|
7797
8095
|
|
|
7798
|
-
if (cparams.
|
|
8096
|
+
if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
|
|
7799
8097
|
const int64_t n_tokens = batch.n_tokens;
|
|
7800
8098
|
|
|
7801
8099
|
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
|
|
7802
|
-
float * data = (float *) lctx.inp_mean->data;
|
|
7803
8100
|
|
|
8101
|
+
float * data = (float *) lctx.inp_mean->data;
|
|
7804
8102
|
memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
|
|
7805
8103
|
|
|
7806
8104
|
std::vector<uint64_t> sum(n_tokens, 0);
|
|
7807
8105
|
for (int i = 0; i < n_tokens; ++i) {
|
|
7808
8106
|
const llama_seq_id seq_id = batch.seq_id[i][0];
|
|
8107
|
+
|
|
8108
|
+
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
|
|
8109
|
+
|
|
7809
8110
|
sum[seq_id] += 1;
|
|
7810
8111
|
}
|
|
7811
8112
|
|
|
@@ -7823,15 +8124,20 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
|
7823
8124
|
}
|
|
7824
8125
|
}
|
|
7825
8126
|
|
|
7826
|
-
if (cparams.
|
|
8127
|
+
if (cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
|
|
7827
8128
|
const int64_t n_tokens = batch.n_tokens;
|
|
7828
8129
|
|
|
7829
8130
|
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
|
|
8131
|
+
|
|
7830
8132
|
uint32_t * data = (uint32_t *) lctx.inp_cls->data;
|
|
8133
|
+
memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
|
|
7831
8134
|
|
|
7832
8135
|
for (int i = 0; i < n_tokens; ++i) {
|
|
7833
8136
|
const llama_seq_id seq_id = batch.seq_id[i][0];
|
|
7834
|
-
const llama_pos
|
|
8137
|
+
const llama_pos pos = batch.pos[i];
|
|
8138
|
+
|
|
8139
|
+
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
|
|
8140
|
+
|
|
7835
8141
|
if (pos == 0) {
|
|
7836
8142
|
data[seq_id] = i;
|
|
7837
8143
|
}
|
|
@@ -7856,6 +8162,7 @@ static void llama_graph_compute(
|
|
|
7856
8162
|
|
|
7857
8163
|
if (lctx.backend_cpu != nullptr) {
|
|
7858
8164
|
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
|
8165
|
+
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
|
|
7859
8166
|
}
|
|
7860
8167
|
|
|
7861
8168
|
ggml_backend_sched_graph_compute(lctx.sched, gf);
|
|
@@ -7944,23 +8251,26 @@ static int llama_decode_internal(
|
|
|
7944
8251
|
batch.seq_id = seq_id_arr.data();
|
|
7945
8252
|
}
|
|
7946
8253
|
|
|
7947
|
-
|
|
8254
|
+
// non-causal masks do not use the KV cache
|
|
8255
|
+
if (hparams.causal_attn) {
|
|
8256
|
+
llama_kv_cache_update(&lctx);
|
|
7948
8257
|
|
|
7949
|
-
|
|
7950
|
-
|
|
7951
|
-
|
|
7952
|
-
|
|
7953
|
-
|
|
8258
|
+
// if we have enough unused cells before the current head ->
|
|
8259
|
+
// better to start searching from the beginning of the cache, hoping to fill it
|
|
8260
|
+
if (kv_self.head > kv_self.used + 2*n_tokens) {
|
|
8261
|
+
kv_self.head = 0;
|
|
8262
|
+
}
|
|
7954
8263
|
|
|
7955
|
-
|
|
7956
|
-
|
|
7957
|
-
|
|
8264
|
+
if (!llama_kv_cache_find_slot(kv_self, batch)) {
|
|
8265
|
+
return 1;
|
|
8266
|
+
}
|
|
7958
8267
|
|
|
7959
|
-
|
|
7960
|
-
|
|
7961
|
-
|
|
7962
|
-
|
|
7963
|
-
|
|
8268
|
+
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
|
8269
|
+
// after enough generations, the benefit from this heuristic disappears
|
|
8270
|
+
// if we start defragmenting the cache, the benefit from this will be more important
|
|
8271
|
+
kv_self.n = std::min(cparams.n_ctx, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
|
|
8272
|
+
//kv_self.n = llama_kv_cache_cell_max(kv_self);
|
|
8273
|
+
}
|
|
7964
8274
|
|
|
7965
8275
|
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
|
|
7966
8276
|
|
|
@@ -7970,20 +8280,26 @@ static int llama_decode_internal(
|
|
|
7970
8280
|
ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
|
|
7971
8281
|
|
|
7972
8282
|
// the output is always the last tensor in the graph
|
|
7973
|
-
struct ggml_tensor * res
|
|
7974
|
-
struct ggml_tensor *
|
|
7975
|
-
|
|
7976
|
-
if (
|
|
7977
|
-
|
|
7978
|
-
|
|
7979
|
-
|
|
7980
|
-
|
|
7981
|
-
|
|
7982
|
-
|
|
7983
|
-
embeddings = res;
|
|
7984
|
-
res = nullptr;
|
|
8283
|
+
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
|
8284
|
+
struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
|
|
8285
|
+
|
|
8286
|
+
if (!hparams.causal_attn) {
|
|
8287
|
+
res = nullptr; // do not extract logits for embedding models such as BERT
|
|
8288
|
+
|
|
8289
|
+
// token or sequence embeddings
|
|
8290
|
+
embd = gf->nodes[gf->n_nodes - 1];
|
|
8291
|
+
|
|
8292
|
+
GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
|
|
7985
8293
|
} else {
|
|
7986
|
-
|
|
8294
|
+
if (strcmp(res->name, "result_output") == 0) {
|
|
8295
|
+
// the token embeddings could be the second to last tensor, or the third to last tensor
|
|
8296
|
+
if (strcmp(embd->name, "result_norm") != 0) {
|
|
8297
|
+
embd = gf->nodes[gf->n_nodes - 3];
|
|
8298
|
+
GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
|
|
8299
|
+
}
|
|
8300
|
+
} else {
|
|
8301
|
+
GGML_ASSERT(false && "missing result_output tensor");
|
|
8302
|
+
}
|
|
7987
8303
|
}
|
|
7988
8304
|
|
|
7989
8305
|
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
|
@@ -8050,46 +8366,82 @@ static int llama_decode_internal(
|
|
|
8050
8366
|
logits_out.clear();
|
|
8051
8367
|
#endif
|
|
8052
8368
|
|
|
8053
|
-
ggml_backend_t
|
|
8054
|
-
GGML_ASSERT(
|
|
8369
|
+
ggml_backend_t backend_res = ggml_backend_sched_get_node_backend(lctx.sched, res);
|
|
8370
|
+
GGML_ASSERT(backend_res != nullptr);
|
|
8371
|
+
|
|
8055
8372
|
if (batch.logits) {
|
|
8056
8373
|
logits_out.resize(n_vocab * n_tokens);
|
|
8057
8374
|
for (uint32_t i = 0; i < n_tokens; i++) {
|
|
8058
8375
|
if (batch.logits[i] == 0) {
|
|
8059
8376
|
continue;
|
|
8060
8377
|
}
|
|
8061
|
-
ggml_backend_tensor_get_async(
|
|
8378
|
+
ggml_backend_tensor_get_async(backend_res, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
|
|
8062
8379
|
#ifndef NDEBUG
|
|
8063
8380
|
logits_valid[i] = true;
|
|
8064
8381
|
#endif
|
|
8065
8382
|
}
|
|
8066
8383
|
} else if (lctx.logits_all) {
|
|
8067
8384
|
logits_out.resize(n_vocab * n_tokens);
|
|
8068
|
-
ggml_backend_tensor_get_async(
|
|
8385
|
+
ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
|
|
8069
8386
|
#ifndef NDEBUG
|
|
8070
8387
|
std::fill(logits_valid.begin(), logits_valid.end(), true);
|
|
8071
8388
|
#endif
|
|
8072
8389
|
} else {
|
|
8073
8390
|
logits_out.resize(n_vocab);
|
|
8074
|
-
ggml_backend_tensor_get_async(
|
|
8391
|
+
ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
|
|
8075
8392
|
#ifndef NDEBUG
|
|
8076
8393
|
logits_valid[0] = true;
|
|
8077
8394
|
#endif
|
|
8078
8395
|
}
|
|
8079
|
-
ggml_backend_synchronize(
|
|
8396
|
+
ggml_backend_synchronize(backend_res);
|
|
8080
8397
|
}
|
|
8081
8398
|
|
|
8082
8399
|
// extract embeddings
|
|
8083
|
-
if (
|
|
8084
|
-
|
|
8400
|
+
if (cparams.embeddings && embd) {
|
|
8401
|
+
ggml_backend_t backend_embd = ggml_backend_sched_get_node_backend(lctx.sched, embd);
|
|
8402
|
+
GGML_ASSERT(backend_embd != nullptr);
|
|
8403
|
+
|
|
8404
|
+
switch (cparams.pooling_type) {
|
|
8405
|
+
case LLAMA_POOLING_TYPE_NONE:
|
|
8406
|
+
{
|
|
8407
|
+
// extract token embeddings
|
|
8408
|
+
auto & embd_out = lctx.embd;
|
|
8409
|
+
|
|
8410
|
+
if (batch.logits) {
|
|
8411
|
+
embd_out.resize(n_embd * n_tokens);
|
|
8412
|
+
for (uint32_t i = 0; i < n_tokens; i++) {
|
|
8413
|
+
if (batch.logits[i] == 0) {
|
|
8414
|
+
continue;
|
|
8415
|
+
}
|
|
8416
|
+
|
|
8417
|
+
ggml_backend_tensor_get_async(backend_embd, embd, embd_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
|
|
8418
|
+
}
|
|
8419
|
+
}
|
|
8420
|
+
} break;
|
|
8421
|
+
case LLAMA_POOLING_TYPE_CLS:
|
|
8422
|
+
case LLAMA_POOLING_TYPE_MEAN:
|
|
8423
|
+
{
|
|
8424
|
+
GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
|
|
8085
8425
|
|
|
8086
|
-
|
|
8087
|
-
|
|
8426
|
+
// extract sequence embeddings
|
|
8427
|
+
auto & embd_seq_out = lctx.embd_seq;
|
|
8428
|
+
embd_seq_out.clear();
|
|
8088
8429
|
|
|
8089
|
-
|
|
8090
|
-
|
|
8091
|
-
|
|
8092
|
-
|
|
8430
|
+
for (uint32_t i = 0; i < n_tokens; i++) {
|
|
8431
|
+
const llama_seq_id seq_id = batch.seq_id[i][0];
|
|
8432
|
+
if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
|
|
8433
|
+
continue;
|
|
8434
|
+
}
|
|
8435
|
+
embd_seq_out[seq_id].resize(n_embd);
|
|
8436
|
+
ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
|
|
8437
|
+
}
|
|
8438
|
+
} break;
|
|
8439
|
+
case LLAMA_POOLING_TYPE_UNSPECIFIED:
|
|
8440
|
+
{
|
|
8441
|
+
GGML_ASSERT(false && "unknown pooling type");
|
|
8442
|
+
} break;
|
|
8443
|
+
}
|
|
8444
|
+
ggml_backend_synchronize(backend_embd);
|
|
8093
8445
|
}
|
|
8094
8446
|
|
|
8095
8447
|
// measure the performance only for the single-token evals
|
|
@@ -8383,19 +8735,19 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
|
|
8383
8735
|
GGML_ASSERT(llama_is_byte_token(vocab, id));
|
|
8384
8736
|
const auto& token_data = vocab.id_to_token.at(id);
|
|
8385
8737
|
switch (llama_vocab_get_type(vocab)) {
|
|
8386
|
-
|
|
8387
|
-
|
|
8388
|
-
|
|
8389
|
-
|
|
8390
|
-
|
|
8391
|
-
|
|
8392
|
-
|
|
8393
|
-
|
|
8394
|
-
|
|
8395
|
-
|
|
8396
|
-
|
|
8397
|
-
|
|
8398
|
-
|
|
8738
|
+
case LLAMA_VOCAB_TYPE_SPM: {
|
|
8739
|
+
auto buf = token_data.text.substr(3, 2);
|
|
8740
|
+
return strtol(buf.c_str(), NULL, 16);
|
|
8741
|
+
}
|
|
8742
|
+
case LLAMA_VOCAB_TYPE_BPE: {
|
|
8743
|
+
GGML_ASSERT(false);
|
|
8744
|
+
return unicode_to_bytes_bpe(token_data.text);
|
|
8745
|
+
}
|
|
8746
|
+
case LLAMA_VOCAB_TYPE_WPM: {
|
|
8747
|
+
GGML_ASSERT(false);
|
|
8748
|
+
}
|
|
8749
|
+
default:
|
|
8750
|
+
GGML_ASSERT(false);
|
|
8399
8751
|
}
|
|
8400
8752
|
}
|
|
8401
8753
|
|
|
@@ -10621,7 +10973,7 @@ struct quantize_state_internal {
|
|
|
10621
10973
|
{}
|
|
10622
10974
|
};
|
|
10623
10975
|
|
|
10624
|
-
static void
|
|
10976
|
+
static void llama_tensor_dequantize_internal(
|
|
10625
10977
|
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
|
|
10626
10978
|
const size_t nelements, const int nthread
|
|
10627
10979
|
) {
|
|
@@ -10962,6 +11314,46 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
|
10962
11314
|
return new_type;
|
|
10963
11315
|
}
|
|
10964
11316
|
|
|
11317
|
+
static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, int64_t * hist_cur, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
|
11318
|
+
std::mutex mutex;
|
|
11319
|
+
int counter = 0;
|
|
11320
|
+
size_t new_size = 0;
|
|
11321
|
+
if (nthread < 2) {
|
|
11322
|
+
// single-thread
|
|
11323
|
+
return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur, imatrix);
|
|
11324
|
+
}
|
|
11325
|
+
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
|
|
11326
|
+
nrows, n_per_row, imatrix]() {
|
|
11327
|
+
std::array<int64_t, 1 << 4> local_hist = {};
|
|
11328
|
+
const int nrows_per_chunk = chunk_size / n_per_row;
|
|
11329
|
+
size_t local_size = 0;
|
|
11330
|
+
while (true) {
|
|
11331
|
+
std::unique_lock<std::mutex> lock(mutex);
|
|
11332
|
+
int first_row = counter; counter += nrows_per_chunk;
|
|
11333
|
+
if (first_row >= nrows) {
|
|
11334
|
+
if (local_size > 0) {
|
|
11335
|
+
for (int j=0; j<int(local_hist.size()); ++j) {
|
|
11336
|
+
hist_cur[j] += local_hist[j];
|
|
11337
|
+
}
|
|
11338
|
+
new_size += local_size;
|
|
11339
|
+
}
|
|
11340
|
+
break;
|
|
11341
|
+
}
|
|
11342
|
+
lock.unlock();
|
|
11343
|
+
const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
|
|
11344
|
+
local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
|
|
11345
|
+
first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
|
|
11346
|
+
}
|
|
11347
|
+
};
|
|
11348
|
+
for (int it = 0; it < nthread - 1; ++it) {
|
|
11349
|
+
workers.emplace_back(compute);
|
|
11350
|
+
}
|
|
11351
|
+
compute();
|
|
11352
|
+
for (auto & w : workers) { w.join(); }
|
|
11353
|
+
workers.clear();
|
|
11354
|
+
return new_size;
|
|
11355
|
+
}
|
|
11356
|
+
|
|
10965
11357
|
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
|
|
10966
11358
|
ggml_type quantized_type;
|
|
10967
11359
|
llama_ftype ftype = params->ftype;
|
|
@@ -11074,7 +11466,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
|
11074
11466
|
|
|
11075
11467
|
std::vector<std::thread> workers;
|
|
11076
11468
|
workers.reserve(nthread);
|
|
11077
|
-
std::mutex mutex;
|
|
11078
11469
|
|
|
11079
11470
|
int idx = 0;
|
|
11080
11471
|
|
|
@@ -11188,7 +11579,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
|
11188
11579
|
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
|
|
11189
11580
|
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
|
|
11190
11581
|
} else {
|
|
11191
|
-
|
|
11582
|
+
llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
|
|
11192
11583
|
f32_data = (float *) f32_conv_buf.data();
|
|
11193
11584
|
}
|
|
11194
11585
|
|
|
@@ -11209,41 +11600,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
|
11209
11600
|
|
|
11210
11601
|
const int nchunk = (nelements + chunk_size - 1)/chunk_size;
|
|
11211
11602
|
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
|
|
11212
|
-
|
|
11213
|
-
new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
|
|
11214
|
-
} else {
|
|
11215
|
-
int counter = 0;
|
|
11216
|
-
new_size = 0;
|
|
11217
|
-
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
|
|
11218
|
-
nrows, n_per_row, imatrix]() {
|
|
11219
|
-
std::array<int64_t, 1 << 4> local_hist = {};
|
|
11220
|
-
const int nrows_per_chunk = chunk_size / n_per_row;
|
|
11221
|
-
size_t local_size = 0;
|
|
11222
|
-
while (true) {
|
|
11223
|
-
std::unique_lock<std::mutex> lock(mutex);
|
|
11224
|
-
int first_row = counter; counter += nrows_per_chunk;
|
|
11225
|
-
if (first_row >= nrows) {
|
|
11226
|
-
if (local_size > 0) {
|
|
11227
|
-
for (int j=0; j<int(local_hist.size()); ++j) {
|
|
11228
|
-
hist_cur[j] += local_hist[j];
|
|
11229
|
-
}
|
|
11230
|
-
new_size += local_size;
|
|
11231
|
-
}
|
|
11232
|
-
break;
|
|
11233
|
-
}
|
|
11234
|
-
lock.unlock();
|
|
11235
|
-
const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
|
|
11236
|
-
local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
|
|
11237
|
-
first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
|
|
11238
|
-
}
|
|
11239
|
-
};
|
|
11240
|
-
for (int it = 0; it < nthread_use - 1; ++it) {
|
|
11241
|
-
workers.emplace_back(compute);
|
|
11242
|
-
}
|
|
11243
|
-
compute();
|
|
11244
|
-
for (auto & w : workers) { w.join(); }
|
|
11245
|
-
workers.clear();
|
|
11246
|
-
}
|
|
11603
|
+
new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, hist_cur.data(), imatrix, workers, nthread_use);
|
|
11247
11604
|
|
|
11248
11605
|
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
|
11249
11606
|
int64_t tot_count = 0;
|
|
@@ -11620,6 +11977,7 @@ struct llama_context_params llama_context_default_params() {
|
|
|
11620
11977
|
/*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
|
|
11621
11978
|
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
|
|
11622
11979
|
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
|
|
11980
|
+
/*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
|
|
11623
11981
|
/*.rope_freq_base =*/ 0.0f,
|
|
11624
11982
|
/*.rope_freq_scale =*/ 0.0f,
|
|
11625
11983
|
/*.yarn_ext_factor =*/ -1.0f,
|
|
@@ -11633,9 +11991,10 @@ struct llama_context_params llama_context_default_params() {
|
|
|
11633
11991
|
/*.type_k =*/ GGML_TYPE_F16,
|
|
11634
11992
|
/*.type_v =*/ GGML_TYPE_F16,
|
|
11635
11993
|
/*.logits_all =*/ false,
|
|
11636
|
-
/*.
|
|
11994
|
+
/*.embeddings =*/ false,
|
|
11637
11995
|
/*.offload_kqv =*/ true,
|
|
11638
|
-
/*.
|
|
11996
|
+
/*.abort_callback =*/ nullptr,
|
|
11997
|
+
/*.abort_callback_data =*/ nullptr,
|
|
11639
11998
|
};
|
|
11640
11999
|
|
|
11641
12000
|
return result;
|
|
@@ -11783,8 +12142,9 @@ struct llama_context * llama_new_context_with_model(
|
|
|
11783
12142
|
cparams.yarn_beta_fast = params.yarn_beta_fast;
|
|
11784
12143
|
cparams.yarn_beta_slow = params.yarn_beta_slow;
|
|
11785
12144
|
cparams.defrag_thold = params.defrag_thold;
|
|
12145
|
+
cparams.embeddings = params.embeddings;
|
|
11786
12146
|
cparams.offload_kqv = params.offload_kqv;
|
|
11787
|
-
cparams.
|
|
12147
|
+
cparams.pooling_type = params.pooling_type;
|
|
11788
12148
|
|
|
11789
12149
|
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
|
11790
12150
|
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
|
|
@@ -11810,6 +12170,14 @@ struct llama_context * llama_new_context_with_model(
|
|
|
11810
12170
|
cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
|
|
11811
12171
|
}
|
|
11812
12172
|
|
|
12173
|
+
if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
|
|
12174
|
+
if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
|
|
12175
|
+
cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
|
|
12176
|
+
} else {
|
|
12177
|
+
cparams.pooling_type = hparams.pooling_type;
|
|
12178
|
+
}
|
|
12179
|
+
}
|
|
12180
|
+
|
|
11813
12181
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
|
11814
12182
|
params.seed = time(NULL);
|
|
11815
12183
|
}
|
|
@@ -11818,8 +12186,11 @@ struct llama_context * llama_new_context_with_model(
|
|
|
11818
12186
|
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
|
11819
12187
|
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
|
|
11820
12188
|
|
|
11821
|
-
ctx->
|
|
11822
|
-
ctx->
|
|
12189
|
+
ctx->abort_callback = params.abort_callback;
|
|
12190
|
+
ctx->abort_callback_data = params.abort_callback_data;
|
|
12191
|
+
|
|
12192
|
+
ctx->rng = std::mt19937(params.seed);
|
|
12193
|
+
ctx->logits_all = params.logits_all;
|
|
11823
12194
|
|
|
11824
12195
|
const ggml_type type_k = params.type_k;
|
|
11825
12196
|
const ggml_type type_v = params.type_v;
|
|
@@ -11877,13 +12248,31 @@ struct llama_context * llama_new_context_with_model(
|
|
|
11877
12248
|
}
|
|
11878
12249
|
#elif defined(GGML_USE_SYCL)
|
|
11879
12250
|
if (model->n_gpu_layers > 0) {
|
|
11880
|
-
|
|
11881
|
-
if (
|
|
11882
|
-
|
|
11883
|
-
|
|
11884
|
-
|
|
12251
|
+
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
|
12252
|
+
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
|
12253
|
+
int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
|
|
12254
|
+
ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
|
|
12255
|
+
if (backend == nullptr) {
|
|
12256
|
+
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
|
|
12257
|
+
llama_free(ctx);
|
|
12258
|
+
return nullptr;
|
|
12259
|
+
}
|
|
12260
|
+
ctx->backends.push_back(backend);
|
|
12261
|
+
} else {
|
|
12262
|
+
// LLAMA_SPLIT_LAYER requires a backend for each GPU
|
|
12263
|
+
int id_list[GGML_SYCL_MAX_DEVICES];
|
|
12264
|
+
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
|
|
12265
|
+
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
|
|
12266
|
+
int device_id = id_list[i];
|
|
12267
|
+
ggml_backend_t backend = ggml_backend_sycl_init(i);
|
|
12268
|
+
if (backend == nullptr) {
|
|
12269
|
+
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
|
|
12270
|
+
llama_free(ctx);
|
|
12271
|
+
return nullptr;
|
|
12272
|
+
}
|
|
12273
|
+
ctx->backends.push_back(backend);
|
|
12274
|
+
}
|
|
11885
12275
|
}
|
|
11886
|
-
ctx->backends.push_back(backend);
|
|
11887
12276
|
}
|
|
11888
12277
|
#elif defined(GGML_USE_KOMPUTE)
|
|
11889
12278
|
if (model->n_gpu_layers > 0) {
|
|
@@ -11931,8 +12320,8 @@ struct llama_context * llama_new_context_with_model(
|
|
|
11931
12320
|
// resized during inference, reserve maximum
|
|
11932
12321
|
ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
|
|
11933
12322
|
|
|
11934
|
-
if (params.
|
|
11935
|
-
ctx->
|
|
12323
|
+
if (params.embeddings) {
|
|
12324
|
+
ctx->embd.reserve(hparams.n_embd*cparams.n_batch);
|
|
11936
12325
|
}
|
|
11937
12326
|
|
|
11938
12327
|
// graph inputs
|
|
@@ -11963,7 +12352,6 @@ struct llama_context * llama_new_context_with_model(
|
|
|
11963
12352
|
ggml_set_name(ctx->inp_cls, "inp_cls");
|
|
11964
12353
|
|
|
11965
12354
|
ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
|
|
11966
|
-
|
|
11967
12355
|
LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
|
|
11968
12356
|
ggml_backend_buffer_name(ctx->buf_input),
|
|
11969
12357
|
ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
|
|
@@ -12084,6 +12472,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
|
12084
12472
|
case LLM_ARCH_QWEN2:
|
|
12085
12473
|
case LLM_ARCH_PHI2:
|
|
12086
12474
|
case LLM_ARCH_GEMMA:
|
|
12475
|
+
case LLM_ARCH_STARCODER2:
|
|
12087
12476
|
return LLAMA_ROPE_TYPE_NEOX;
|
|
12088
12477
|
|
|
12089
12478
|
// all model arches should be listed explicitly here
|
|
@@ -12367,10 +12756,15 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
|
|
|
12367
12756
|
// assume worst case for logits although only currently set ones are serialized
|
|
12368
12757
|
const size_t s_logits = ctx->logits.capacity() * sizeof(float);
|
|
12369
12758
|
const size_t s_embedding_size = sizeof(size_t);
|
|
12370
|
-
const size_t s_embedding = ctx->
|
|
12371
|
-
const size_t
|
|
12372
|
-
const size_t
|
|
12759
|
+
const size_t s_embedding = ctx->embd.capacity() * sizeof(float);
|
|
12760
|
+
const size_t s_kv_buf_size = sizeof(size_t);
|
|
12761
|
+
const size_t s_kv_head = sizeof(uint32_t);
|
|
12762
|
+
const size_t s_kv_size = sizeof(uint32_t);
|
|
12763
|
+
const size_t s_kv_used = sizeof(uint32_t);
|
|
12373
12764
|
const size_t s_kv = ctx->kv_self.total_size();
|
|
12765
|
+
// TODO: assume the max is more than 1 seq_id per KV cell
|
|
12766
|
+
const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
|
|
12767
|
+
const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
|
|
12374
12768
|
|
|
12375
12769
|
const size_t s_total = (
|
|
12376
12770
|
+ s_rng_size
|
|
@@ -12379,9 +12773,12 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
|
|
|
12379
12773
|
+ s_logits
|
|
12380
12774
|
+ s_embedding_size
|
|
12381
12775
|
+ s_embedding
|
|
12776
|
+
+ s_kv_buf_size
|
|
12777
|
+
+ s_kv_head
|
|
12382
12778
|
+ s_kv_size
|
|
12383
|
-
+
|
|
12779
|
+
+ s_kv_used
|
|
12384
12780
|
+ s_kv
|
|
12781
|
+
+ s_kv_cells
|
|
12385
12782
|
);
|
|
12386
12783
|
|
|
12387
12784
|
return s_total;
|
|
@@ -12468,12 +12865,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
|
12468
12865
|
|
|
12469
12866
|
// copy embeddings
|
|
12470
12867
|
{
|
|
12471
|
-
const size_t
|
|
12868
|
+
const size_t embeddings_size = ctx->embd.size();
|
|
12472
12869
|
|
|
12473
|
-
data_ctx->write(&
|
|
12870
|
+
data_ctx->write(&embeddings_size, sizeof(embeddings_size));
|
|
12474
12871
|
|
|
12475
|
-
if (
|
|
12476
|
-
data_ctx->write(ctx->
|
|
12872
|
+
if (embeddings_size) {
|
|
12873
|
+
data_ctx->write(ctx->embd.data(), embeddings_size * sizeof(float));
|
|
12477
12874
|
}
|
|
12478
12875
|
}
|
|
12479
12876
|
|
|
@@ -12481,15 +12878,13 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
|
12481
12878
|
{
|
|
12482
12879
|
const auto & kv_self = ctx->kv_self;
|
|
12483
12880
|
const auto & hparams = ctx->model.hparams;
|
|
12484
|
-
const auto & cparams = ctx->cparams;
|
|
12485
12881
|
|
|
12486
12882
|
const uint32_t n_layer = hparams.n_layer;
|
|
12487
12883
|
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
|
12488
12884
|
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
|
12489
|
-
const uint32_t n_ctx = cparams.n_ctx;
|
|
12490
12885
|
|
|
12491
12886
|
const size_t kv_buf_size = kv_self.total_size();
|
|
12492
|
-
const uint32_t kv_head = kv_self
|
|
12887
|
+
const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
|
|
12493
12888
|
const uint32_t kv_size = kv_self.size;
|
|
12494
12889
|
const uint32_t kv_used = kv_self.used;
|
|
12495
12890
|
|
|
@@ -12509,7 +12904,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
|
12509
12904
|
|
|
12510
12905
|
// v is not contiguous, copy row by row
|
|
12511
12906
|
const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
|
|
12512
|
-
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type,
|
|
12907
|
+
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
|
|
12513
12908
|
|
|
12514
12909
|
tmp_buf.resize(v_row_size);
|
|
12515
12910
|
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
|
@@ -12519,7 +12914,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
|
12519
12914
|
}
|
|
12520
12915
|
}
|
|
12521
12916
|
|
|
12522
|
-
for (uint32_t i = 0; i <
|
|
12917
|
+
for (uint32_t i = 0; i < kv_head; ++i) {
|
|
12523
12918
|
const auto & cell = kv_self.cells[i];
|
|
12524
12919
|
|
|
12525
12920
|
const llama_pos pos = cell.pos;
|
|
@@ -12579,15 +12974,17 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
|
12579
12974
|
|
|
12580
12975
|
// set embeddings
|
|
12581
12976
|
{
|
|
12582
|
-
size_t
|
|
12977
|
+
size_t embeddings_size;
|
|
12583
12978
|
|
|
12584
|
-
memcpy(&
|
|
12979
|
+
memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
|
|
12585
12980
|
|
|
12586
|
-
GGML_ASSERT(ctx->
|
|
12981
|
+
GGML_ASSERT(ctx->embd.capacity() == embeddings_size);
|
|
12587
12982
|
|
|
12588
|
-
if (
|
|
12589
|
-
|
|
12590
|
-
|
|
12983
|
+
if (embeddings_size) {
|
|
12984
|
+
ctx->embd.resize(embeddings_size);
|
|
12985
|
+
|
|
12986
|
+
memcpy(ctx->embd.data(), inp, embeddings_size * sizeof(float));
|
|
12987
|
+
inp += embeddings_size * sizeof(float);
|
|
12591
12988
|
}
|
|
12592
12989
|
}
|
|
12593
12990
|
|
|
@@ -12595,12 +12992,10 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
|
12595
12992
|
{
|
|
12596
12993
|
const auto & kv_self = ctx->kv_self;
|
|
12597
12994
|
const auto & hparams = ctx->model.hparams;
|
|
12598
|
-
const auto & cparams = ctx->cparams;
|
|
12599
12995
|
|
|
12600
12996
|
const uint32_t n_layer = hparams.n_layer;
|
|
12601
12997
|
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
|
12602
12998
|
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
|
12603
|
-
const uint32_t n_ctx = cparams.n_ctx;
|
|
12604
12999
|
|
|
12605
13000
|
size_t kv_buf_size;
|
|
12606
13001
|
uint32_t kv_head;
|
|
@@ -12623,7 +13018,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
|
12623
13018
|
|
|
12624
13019
|
// v is not contiguous, copy row by row
|
|
12625
13020
|
const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
|
|
12626
|
-
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type,
|
|
13021
|
+
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
|
|
12627
13022
|
|
|
12628
13023
|
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
|
12629
13024
|
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
|
|
@@ -12632,13 +13027,15 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
|
12632
13027
|
}
|
|
12633
13028
|
}
|
|
12634
13029
|
|
|
13030
|
+
GGML_ASSERT(kv_self.size == kv_size);
|
|
13031
|
+
|
|
12635
13032
|
ctx->kv_self.head = kv_head;
|
|
12636
13033
|
ctx->kv_self.size = kv_size;
|
|
12637
13034
|
ctx->kv_self.used = kv_used;
|
|
12638
13035
|
|
|
12639
13036
|
ctx->kv_self.cells.resize(kv_size);
|
|
12640
13037
|
|
|
12641
|
-
for (uint32_t i = 0; i <
|
|
13038
|
+
for (uint32_t i = 0; i < kv_head; ++i) {
|
|
12642
13039
|
llama_pos pos;
|
|
12643
13040
|
size_t seq_id_size;
|
|
12644
13041
|
|
|
@@ -12654,6 +13051,11 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
|
12654
13051
|
ctx->kv_self.cells[i].seq_id.insert(seq_id);
|
|
12655
13052
|
}
|
|
12656
13053
|
}
|
|
13054
|
+
|
|
13055
|
+
for (uint32_t i = kv_head; i < kv_size; ++i) {
|
|
13056
|
+
ctx->kv_self.cells[i].pos = -1;
|
|
13057
|
+
ctx->kv_self.cells[i].seq_id.clear();
|
|
13058
|
+
}
|
|
12657
13059
|
}
|
|
12658
13060
|
|
|
12659
13061
|
const size_t nread = inp - src;
|
|
@@ -12751,6 +13153,11 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
|
|
|
12751
13153
|
ctx->cparams.n_threads_batch = n_threads_batch;
|
|
12752
13154
|
}
|
|
12753
13155
|
|
|
13156
|
+
void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
|
|
13157
|
+
ctx->abort_callback = abort_callback;
|
|
13158
|
+
ctx->abort_callback_data = abort_callback_data;
|
|
13159
|
+
}
|
|
13160
|
+
|
|
12754
13161
|
struct llama_batch llama_batch_get_one(
|
|
12755
13162
|
llama_token * tokens,
|
|
12756
13163
|
int32_t n_tokens,
|
|
@@ -12827,11 +13234,20 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
|
|
12827
13234
|
}
|
|
12828
13235
|
|
|
12829
13236
|
float * llama_get_embeddings(struct llama_context * ctx) {
|
|
12830
|
-
return ctx->
|
|
13237
|
+
return ctx->embd.data();
|
|
12831
13238
|
}
|
|
12832
13239
|
|
|
12833
13240
|
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
|
|
12834
|
-
return ctx->
|
|
13241
|
+
return ctx->embd.data() + i*ctx->model.hparams.n_embd;
|
|
13242
|
+
}
|
|
13243
|
+
|
|
13244
|
+
float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
|
|
13245
|
+
auto it = ctx->embd_seq.find(seq_id);
|
|
13246
|
+
if (it == ctx->embd_seq.end()) {
|
|
13247
|
+
return nullptr;
|
|
13248
|
+
}
|
|
13249
|
+
|
|
13250
|
+
return it->second.data();
|
|
12835
13251
|
}
|
|
12836
13252
|
|
|
12837
13253
|
const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
|
|
@@ -13005,7 +13421,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
13005
13421
|
std::string & dest, bool add_ass) {
|
|
13006
13422
|
// Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
|
|
13007
13423
|
std::stringstream ss;
|
|
13008
|
-
if (tmpl.find("<|im_start|>") != std::string::npos) {
|
|
13424
|
+
if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) {
|
|
13009
13425
|
// chatml template
|
|
13010
13426
|
for (auto message : chat) {
|
|
13011
13427
|
ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
|
|
@@ -13013,7 +13429,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
13013
13429
|
if (add_ass) {
|
|
13014
13430
|
ss << "<|im_start|>assistant\n";
|
|
13015
13431
|
}
|
|
13016
|
-
} else if (tmpl.find("[INST]") != std::string::npos) {
|
|
13432
|
+
} else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) {
|
|
13017
13433
|
// llama2 template and its variants
|
|
13018
13434
|
// [variant] support system message
|
|
13019
13435
|
bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
|
|
@@ -13048,7 +13464,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
13048
13464
|
}
|
|
13049
13465
|
}
|
|
13050
13466
|
// llama2 templates seem to not care about "add_generation_prompt"
|
|
13051
|
-
} else if (tmpl.find("<|user|>") != std::string::npos) {
|
|
13467
|
+
} else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
|
|
13052
13468
|
// zephyr template
|
|
13053
13469
|
for (auto message : chat) {
|
|
13054
13470
|
ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
|
|
@@ -13056,7 +13472,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
13056
13472
|
if (add_ass) {
|
|
13057
13473
|
ss << "<|assistant|>\n";
|
|
13058
13474
|
}
|
|
13059
|
-
} else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
|
|
13475
|
+
} else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) {
|
|
13060
13476
|
// mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
|
|
13061
13477
|
for (auto message : chat) {
|
|
13062
13478
|
std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
|
|
@@ -13065,7 +13481,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
13065
13481
|
if (add_ass) {
|
|
13066
13482
|
ss << "<s>assistant\n";
|
|
13067
13483
|
}
|
|
13068
|
-
} else if (tmpl.find("<start_of_turn>") != std::string::npos) {
|
|
13484
|
+
} else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
|
|
13069
13485
|
// google/gemma-7b-it
|
|
13070
13486
|
std::string system_prompt = "";
|
|
13071
13487
|
for (auto message : chat) {
|
|
@@ -13112,23 +13528,27 @@ LLAMA_API int32_t llama_chat_apply_template(
|
|
|
13112
13528
|
int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
|
|
13113
13529
|
if (res < 0) {
|
|
13114
13530
|
// worst case: there is no information about template, we will use chatml by default
|
|
13115
|
-
curr_tmpl = "
|
|
13531
|
+
curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
|
|
13116
13532
|
} else {
|
|
13117
13533
|
curr_tmpl = std::string(model_template.data(), model_template.size());
|
|
13118
13534
|
}
|
|
13119
13535
|
}
|
|
13536
|
+
|
|
13120
13537
|
// format the chat to string
|
|
13121
13538
|
std::vector<const llama_chat_message *> chat_vec;
|
|
13122
13539
|
chat_vec.resize(n_msg);
|
|
13123
13540
|
for (size_t i = 0; i < n_msg; i++) {
|
|
13124
13541
|
chat_vec[i] = &chat[i];
|
|
13125
13542
|
}
|
|
13543
|
+
|
|
13126
13544
|
std::string formatted_chat;
|
|
13127
13545
|
int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
|
|
13128
13546
|
if (res < 0) {
|
|
13129
13547
|
return res;
|
|
13130
13548
|
}
|
|
13131
|
-
|
|
13549
|
+
if (buf && length > 0) {
|
|
13550
|
+
strncpy(buf, formatted_chat.c_str(), length);
|
|
13551
|
+
}
|
|
13132
13552
|
return res;
|
|
13133
13553
|
}
|
|
13134
13554
|
|