llama_cpp 0.13.0 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
@@ -104,6 +104,7 @@
 #define LLAMA_MAX_NODES 8192
 #define LLAMA_MAX_EXPERTS 8

+
 //
 // logging
 //
@@ -211,10 +212,12 @@ enum llm_arch {
     LLM_ARCH_INTERNLM2,
     LLM_ARCH_MINICPM,
     LLM_ARCH_GEMMA,
+    LLM_ARCH_STARCODER2,
+    LLM_ARCH_MAMBA,
     LLM_ARCH_UNKNOWN,
 };

-static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA, "llama" },
     { LLM_ARCH_FALCON, "falcon" },
     { LLM_ARCH_GPT2, "gpt2" },
@@ -238,6 +241,9 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_INTERNLM2, "internlm2" },
     { LLM_ARCH_MINICPM, "minicpm" },
     { LLM_ARCH_GEMMA, "gemma" },
+    { LLM_ARCH_STARCODER2, "starcoder2" },
+    { LLM_ARCH_MAMBA, "mamba" },
+    { LLM_ARCH_UNKNOWN, "(unknown)" },
 };

 enum llm_kv {
@@ -252,6 +258,7 @@ enum llm_kv {
     LLM_KV_GENERAL_SOURCE_URL,
     LLM_KV_GENERAL_SOURCE_HF_REPO,

+    LLM_KV_VOCAB_SIZE,
     LLM_KV_CONTEXT_LENGTH,
     LLM_KV_EMBEDDING_LENGTH,
     LLM_KV_BLOCK_COUNT,
@@ -280,6 +287,11 @@ enum llm_kv {
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,

+    LLM_KV_SSM_INNER_SIZE,
+    LLM_KV_SSM_CONV_KERNEL,
+    LLM_KV_SSM_STATE_SIZE,
+    LLM_KV_SSM_TIME_STEP_RANK,
+
     LLM_KV_TOKENIZER_MODEL,
     LLM_KV_TOKENIZER_LIST,
     LLM_KV_TOKENIZER_TOKEN_TYPE,
@@ -298,7 +310,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_RWKV,
 };

-static std::map<llm_kv, const char *> LLM_KV_NAMES = {
+static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
     { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
     { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
@@ -310,6 +322,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
     { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },

+    { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
     { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
     { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
     { LLM_KV_BLOCK_COUNT, "%s.block_count" },
@@ -338,6 +351,11 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
     { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },

+    { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
+    { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
+    { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
+    { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
+
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
@@ -362,7 +380,7 @@ struct LLM_KV {
     llm_arch arch;

     std::string operator()(llm_kv kv) const {
-        return ::format(LLM_KV_NAMES
+        return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
     }
 };

@@ -395,9 +413,16 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
+    LLM_TENSOR_SSM_IN,
+    LLM_TENSOR_SSM_CONV1D,
+    LLM_TENSOR_SSM_X,
+    LLM_TENSOR_SSM_DT,
+    LLM_TENSOR_SSM_A,
+    LLM_TENSOR_SSM_D,
+    LLM_TENSOR_SSM_OUT,
 };

-static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
+static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
     {
         LLM_ARCH_LLAMA,
         {
@@ -779,6 +804,40 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_STARCODER2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_MAMBA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+            { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+            { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" },
+            { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+            { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+            { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+            { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -812,38 +871,38 @@ struct LLM_TN {
     llm_arch arch;

     std::string operator()(llm_tensor tensor) const {
-        if (LLM_TENSOR_NAMES
+        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
             return "__missing__";
         }
-        return LLM_TENSOR_NAMES
+        return LLM_TENSOR_NAMES.at(arch).at(tensor);
     }

     std::string operator()(llm_tensor tensor, const std::string & suffix) const {
-        if (LLM_TENSOR_NAMES
+        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
             return "__missing__";
         }
-        return LLM_TENSOR_NAMES
+        return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix;
     }

     std::string operator()(llm_tensor tensor, int bid) const {
-        if (LLM_TENSOR_NAMES
+        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
             return "__missing__";
         }
-        return ::format(LLM_TENSOR_NAMES
+        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid);
     }

     std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
-        if (LLM_TENSOR_NAMES
+        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
             return "__missing__";
         }
-        return ::format(LLM_TENSOR_NAMES
+        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix;
     }

     std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
-        if (LLM_TENSOR_NAMES
+        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
             return "__missing__";
         }
-        return ::format(LLM_TENSOR_NAMES
+        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
     }
 };

@@ -851,16 +910,16 @@ struct LLM_TN {
 // gguf helpers
 //

-static std::map<
+static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
     { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
     { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
     { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
 };

-static
+static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
     for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
         if (kv.second == name) {
-            return kv.first;
+            return (llama_rope_scaling_type) kv.first;
         }
     }

@@ -921,21 +980,6 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
     }
 }

-//
-// ggml helpers
-//
-
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
 //
 // llama helpers
 //
@@ -1409,7 +1453,9 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
         buft = ggml_backend_cuda_host_buffer_type();
     }
 #elif defined(GGML_USE_SYCL)
-
+    if (host_buffer) {
+        buft = ggml_backend_sycl_host_buffer_type();
+    }
 #elif defined(GGML_USE_CPU_HBM)
     buft = ggml_backend_cpu_hbm_buffer_type();
 #elif defined(GGML_USE_VULKAN)
@@ -1463,6 +1509,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
     }
 #endif

+#ifdef GGML_USE_SYCL
+    if (ggml_backend_sycl_get_device_count() > 1) {
+        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+    }
+#endif
+
     if (buft == nullptr) {
         buft = llama_default_buffer_type_offload(fallback_gpu);
     }
@@ -1474,6 +1526,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 static size_t llama_get_device_count() {
 #if defined(GGML_USE_CUBLAS)
     return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
     return ggml_backend_vk_get_device_count();
 #else
@@ -1487,6 +1541,11 @@ static size_t llama_get_device_memory(int device) {
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &total, &free);
     return free;
+#elif defined(GGML_USE_SYCL)
+    size_t total;
+    size_t free;
+    ggml_backend_sycl_get_device_memory(device, &total, &free);
+    return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
@@ -1575,7 +1634,12 @@ struct llama_hparams {
     float rope_freq_base_train;
     float rope_freq_scale_train;
     uint32_t n_yarn_orig_ctx;
-
+
+    // for State Space Models
+    uint32_t ssm_d_conv = 0;
+    uint32_t ssm_d_inner = 0;
+    uint32_t ssm_d_state = 0;
+    uint32_t ssm_dt_rank = 0;

     float f_clamp_kqv = 0.0f;
     float f_max_alibi_bias = 0.0f;
@@ -1583,8 +1647,9 @@ struct llama_hparams {
     bool causal_attn = true;
     bool need_kq_pos = false;

-    enum llama_pooling_type
-    enum llama_rope_type
+    enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
+    enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
+    enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;

     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
@@ -1604,6 +1669,11 @@ struct llama_hparams {
         if (this->rope_finetuned != other.rope_finetuned) return true;
         if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;

+        if (this->ssm_d_conv != other.ssm_d_conv) return true;
+        if (this->ssm_d_inner != other.ssm_d_inner) return true;
+        if (this->ssm_d_state != other.ssm_d_state) return true;
+        if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
+
         const float EPSILON = 1e-9f;

         if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
@@ -1615,6 +1685,9 @@ struct llama_hparams {
     }

     uint32_t n_gqa() const {
+        if (n_head_kv == 0) {
+            return 0;
+        }
         return n_head/n_head_kv;
     }

@@ -1625,16 +1698,29 @@ struct llama_hparams {
     uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads
         return n_embd_head_v * n_head_kv;
     }
+
+    uint32_t n_embd_k_s() const { // dimension of the rolling state embeddings
+        // corresponds to Mamba's conv_states size
+        // TODO: maybe support other convolution strides than 1
+        // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
+        return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
+    }
+
+    uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
+        // corresponds to Mamba's ssm_states size
+        return ssm_d_state * ssm_d_inner;
+    }
 };

 struct llama_cparams {
-    uint32_t n_ctx;
+    uint32_t n_ctx; // context size used during inference
     uint32_t n_batch;
+    uint32_t n_ubatch;
     uint32_t n_threads; // number of threads to use for generation
     uint32_t n_threads_batch; // number of threads to use for batch processing

-    float
-    float
+    float rope_freq_base;
+    float rope_freq_scale;

     uint32_t n_yarn_orig_ctx;
     // These hyperparameters are not exposed in GGUF, because all
@@ -1645,8 +1731,11 @@ struct llama_cparams {
     float yarn_beta_slow;
     float defrag_thold;

+    bool embeddings;
+    bool causal_attn;
     bool offload_kqv;
-
+
+    enum llama_pooling_type pooling_type;

     ggml_backend_sched_eval_callback cb_eval;
     void * cb_eval_user_data;
@@ -1700,11 +1789,27 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_b; // b2
     struct ggml_tensor * ffn_up_b; // b3
     struct ggml_tensor * ffn_act;
+
+    // mamba proj
+    struct ggml_tensor * ssm_in;
+    struct ggml_tensor * ssm_x;
+    struct ggml_tensor * ssm_dt;
+    struct ggml_tensor * ssm_out;
+
+    // mamba
+    struct ggml_tensor * ssm_conv1d;
+    struct ggml_tensor * ssm_a;
+    struct ggml_tensor * ssm_d;
+
+    // mamba bias
+    struct ggml_tensor * ssm_conv1d_b;
+    struct ggml_tensor * ssm_dt_b;
 };

 struct llama_kv_cell {
     llama_pos pos = -1;
     llama_pos delta = 0;
+    int32_t src = 0; // used by recurrent state models to copy states

     std::set<llama_seq_id> seq_id;

@@ -1725,6 +1830,9 @@ struct llama_kv_cell {
 struct llama_kv_cache {
     bool has_shift = false;
     bool do_defrag = false;
+    bool do_copy = false;
+    // with recurrent state models, a cell can hold the state for more than one past token
+    bool recurrent = false;

     // Note: The value of head isn't only used to optimize searching
     // for a free KV slot. llama_decode_internal also uses it, so it
@@ -1904,8 +2012,7 @@ struct llama_context {
         ggml_vk_free_cpu_assist();
 #endif

-        ggml_backend_buffer_free(
-        ggml_free(ctx_input);
+        ggml_backend_buffer_free(buf_output);
     }

     llama_cparams cparams;
@@ -1931,36 +2038,54 @@ struct llama_context {
     int64_t t_p_eval_us = 0;
     int64_t t_eval_us = 0;

+    int64_t t_compute_start_us = 0;
+    int64_t n_queued_tokens = 0;
+
     int32_t n_sample = 0; // number of tokens sampled
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
     int32_t n_eval = 0; // number of eval calls

+    // host buffer for the model output (logits and embeddings)
+    ggml_backend_buffer_t buf_output = nullptr;
+
     // decode output (2-dimensional array: [n_tokens][n_vocab])
-
+    size_t logits_size = 0;
+    float * logits = nullptr;
+
 #ifndef NDEBUG
     // guard against access to unset logits
     std::vector<bool> logits_valid;
 #endif
     bool logits_all = false;

-    //
-
+    // embeddings output (2-dimensional array: [n_tokens][n_embd])
+    // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
+    size_t embd_size = 0;
+    float * embd = nullptr;
+
+    // sequence embeddings output (map of [n_embd] vectors)
+    // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
+    std::map<llama_seq_id, std::vector<float>> embd_seq;

     // memory buffers used to evaluate the model
     std::vector<uint8_t> buf_compute_meta;
     ggml_backend_sched_t sched = nullptr;

+    ggml_abort_callback abort_callback = nullptr;
+    void * abort_callback_data = nullptr;
+
     // input tensors
-    ggml_backend_buffer_t buf_input = nullptr;
-    ggml_context * ctx_input = nullptr;
     struct ggml_tensor * inp_tokens; // I32 [n_batch]
     struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
     struct ggml_tensor * inp_pos; // I32 [n_batch]
-    struct ggml_tensor * inp_KQ_mask; // F32 [
-    struct ggml_tensor * inp_KQ_pos; // F32 [
-    struct ggml_tensor * inp_K_shift; // I32 [
+    struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
+    struct ggml_tensor * inp_KQ_pos; // F32 [kv_size]
+    struct ggml_tensor * inp_K_shift; // I32 [kv_size]
     struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
     struct ggml_tensor * inp_cls; // I32 [n_batch]
+    struct ggml_tensor * inp_s_copy; // I32 [kv_size]
+    struct ggml_tensor * inp_s_mask; // F32 [1, kv_size]
+    struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch]

 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
@@ -1976,25 +2101,42 @@ static bool llama_kv_cache_init(
     const llama_model & model,
     ggml_type type_k,
     ggml_type type_v,
-    uint32_t
+    uint32_t kv_size,
     bool offload) {
     const struct llama_hparams & hparams = model.hparams;

-    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
+    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
     const int64_t n_layer = hparams.n_layer;

     cache.has_shift = false;

+    // TODO: find a nicer way to add other recurrent model architectures
+    cache.recurrent = model.arch == LLM_ARCH_MAMBA;
+
+    // TODO: support mixed reccurent Transformer architectues
+    // NOTE: (!a || b) is a logical implication (a -> b)
+    GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
+    GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());
+    GGML_ASSERT( cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_gqa());
+    GGML_ASSERT( cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_gqa());
+
     cache.head = 0;
-    cache.size =
+    cache.size = kv_size;
     cache.used = 0;

     cache.type_k = type_k;
     cache.type_v = type_v;

     cache.cells.clear();
-    cache.cells.resize(
+    cache.cells.resize(kv_size);
+
+    if (cache.recurrent) {
+        // init state copy sources
+        for (uint32_t i = 0; i < cache.size; ++i) {
+            cache.cells[i].src = i;
+        }
+    }

 #ifdef GGML_USE_CLBLAST
     offload = false;
@@ -2033,8 +2175,8 @@ static bool llama_kv_cache_init(

     for (int i = 0; i < (int) n_layer; i++) {
         struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
-        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*
-        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*
+        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
         ggml_format_name(k, "cache_k_l%d", i);
         ggml_format_name(v, "cache_v_l%d", i);
         cache.k_l.push_back(k);
@@ -2068,6 +2210,54 @@ static bool llama_kv_cache_find_slot(
     const uint32_t n_ctx = cache.size;
     const uint32_t n_tokens = batch.n_tokens;

+    if (cache.recurrent) {
+        // For recurrent state architectures (like Mamba),
+        // each KV cache cell can store the state for a whole sequence.
+
+        llama_seq_id min = cache.size - 1;
+        llama_seq_id max = 0;
+
+        for (uint32_t i = 0; i < n_tokens; ++i) {
+            for (int32_t j = 0; j < batch.n_seq_id[i]; ++j) {
+                llama_seq_id seq_id = batch.seq_id[i][j];
+                // make sure it's a valid seq_id
+                if ((uint32_t) seq_id < cache.size) {
+                    if (seq_id > max) {
+                        max = seq_id;
+                    }
+                    if (seq_id < min) {
+                        min = seq_id;
+                    }
+                    // Assuming the tokens are in-order
+                    if (batch.pos[i] != cache.cells[seq_id].pos + 1) {
+                        // What should happen when the pos backtracks or skips a value?
+                        // Clearing the state mid-batch would require special-casing which isn't done.
+                        LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d\n",
+                            __func__, batch.pos[i], cache.cells[seq_id].pos, seq_id);
+                    }
+                    if (cache.cells[seq_id].pos < 0 && 0 <= batch.pos[i]) {
+                        cache.used += 1;
+                    }
+                    cache.cells[seq_id].pos = batch.pos[i];
+                    // NOTE: seq_ids are not inserted here; they are handled when the input tensors are set
+                } else {
+                    // too big seq_id
+                    // TODO: would it be possible to resize the KV cache size instead?
+                    LLAMA_LOG_ERROR("%s: seq_id=%d >= kv_size=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size);
+                    return false;
+                }
+            }
+        }
+
+        // allow getting the range of used cells, from head to head + n
+        cache.head = min;
+        cache.n = max - min + 1;
+
+        // sanity check
+        return max >= min;
+    }
+    // otherwise, one cell per token.
+
     if (n_tokens > n_ctx) {
         LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
         return false;
@@ -2116,10 +2306,12 @@ static bool llama_kv_cache_find_slot(
 }

 // find how many cells are currently in use
-static
-for (uint32_t i = cache.size
-
-
+static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
+    for (uint32_t i = cache.size; i > 0; --i) {
+        const llama_kv_cell & cell = cache.cells[i - 1];
+
+        if (cell.pos >= 0 && !cell.is_empty()) {
+            return i;
         }
     }

@@ -2135,7 +2327,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
     cache.used = 0;
 }

-static
+static bool llama_kv_cache_seq_rm(
         struct llama_kv_cache & cache,
         llama_seq_id seq_id,
         llama_pos p0,
@@ -2145,6 +2337,25 @@ static void llama_kv_cache_seq_rm(
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

+    // models like Mamba can't have a state partially erased
+    if (cache.recurrent) {
+        if (seq_id >= (int64_t) cache.size) {
+            // could be fatal
+            return false;
+        }
+        if (0 <= seq_id) {
+            // partial intersection is invalid
+            if ((0 < p0 && p0 <= cache.cells[seq_id].pos) || (0 < p1 && p1 <= cache.cells[seq_id].pos)) {
+                return false;
+            }
+        } else {
+            // seq_id is negative, then the range should include everything or nothing
+            if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
+                return false;
+            }
+        }
+    }
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             if (seq_id < 0) {
@@ -2166,6 +2377,8 @@ static void llama_kv_cache_seq_rm(

     // If we freed up a slot, set head to it so searching can start there.
     if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
+
+    return true;
 }

 static void llama_kv_cache_seq_cp(
@@ -2177,6 +2390,29 @@ static void llama_kv_cache_seq_cp(
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

+    if (cache.recurrent) {
+        if ((uint32_t) seq_id_dst < cache.size && (uint32_t) seq_id_src < cache.size) {
+            seq_id_src = cache.cells[seq_id_src].src;
+            GGML_ASSERT((uint32_t) seq_id_src < cache.size);
+            // intent to "copy from"
+            // supports copy chains thanks to taking the source of the source
+            cache.cells[seq_id_dst].src = seq_id_src;
+
+            // preserve the "keep or clear" status of the copied sequence
+            if (cache.cells[seq_id_src].has_seq_id(seq_id_src)) {
+                cache.cells[seq_id_dst].seq_id.insert(seq_id_dst);
+            } else {
+                cache.cells[seq_id_dst].seq_id.erase(seq_id_dst);
+            }
+
+            cache.do_copy = true;
+
+            cache.cells[seq_id_dst].pos = cache.cells[seq_id_src].pos;
+        }
+        return;
+    }
+    // otherwise, this is the KV cache of a Transformer-like model
+
     cache.head = 0;

     for (uint32_t i = 0; i < cache.size; ++i) {
@@ -2216,6 +2452,17 @@ static void llama_kv_cache_seq_add(
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

+    if (cache.recurrent) {
+        // for Mamba-like models, only the pos needs to be shifted
+        if (0 <= seq_id && seq_id < (int64_t) cache.size) {
+            llama_kv_cell & cell = cache.cells[seq_id];
+            if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
+                cell.pos += delta;
+            }
+        }
+        return;
+    }
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.has_shift = true;
@@ -2249,6 +2496,17 @@ static void llama_kv_cache_seq_div(
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

+    if (cache.recurrent) {
+        // for Mamba-like models, only the pos needs to be changed
+        if (0 <= seq_id && seq_id < (int64_t) cache.size) {
+            llama_kv_cell & cell = cache.cells[seq_id];
+            if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
+                cell.pos /= d;
+            }
+        }
+        return;
+    }
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.has_shift = true;
@@ -2891,7 +3149,11 @@ template<>
 bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
     uint32_t tmp;
     const bool found = get_key(kid, tmp, required);
-
+    if (found) {
+        result = (enum llama_pooling_type) tmp;
+    } else {
+        result = LLAMA_POOLING_TYPE_UNSPECIFIED;
+    }
     return found;
 }

@@ -2982,10 +3244,11 @@ static const char * llama_model_type_name(e_model type) {

 static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
     switch (type) {
-        case
-        case
-        case
-
+        case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
+        case LLAMA_VOCAB_TYPE_SPM: return "SPM";
+        case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
+        default: return "unknown";
     }
 }

@@ -3017,14 +3280,14 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);

     // get hparams kv
-    ml.get_arr_n(LLM_KV_TOKENIZER_LIST,
-    ml.get_key
-    ml.get_key
-    ml.get_key
-    ml.get_key
-    ml.get_key
-    ml.get_key
-    ml.get_key
+    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+    ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
+    ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
+    ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
+    ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
+    ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
+    ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
+    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);

     GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
     GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
@@ -3064,7 +3327,7 @@ static void llm_load_hparams(

     // sanity check for n_rot (optional)
     {
-        hparams.n_rot = hparams.n_embd / hparams.n_head;
+        hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;

         ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

@@ -3077,10 +3340,10 @@ static void llm_load_hparams(
         // gpt-j n_rot = rotary_dim
     }

-    hparams.n_embd_head_k = hparams.n_embd / hparams.n_head;
+    hparams.n_embd_head_k = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
     ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);

-    hparams.n_embd_head_v = hparams.n_embd / hparams.n_head;
+    hparams.n_embd_head_v = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
     ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);

     // arch-specific KVs
@@ -3168,7 +3431,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);

                 switch (hparams.n_layer) {
                     case 3:
@@ -3320,6 +3583,46 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_STARCODER2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 30: model.type = e_model::MODEL_3B; break;
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_15B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_MAMBA:
+            {
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 24:
+                        switch (hparams.n_embd) {
+                            case 768: model.type = e_model::MODEL_SMALL; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 48:
+                        switch (hparams.n_embd) {
+                            case 1024: model.type = e_model::MODEL_MEDIUM; break;
+                            case 1536: model.type = e_model::MODEL_LARGE; break;
+                            case 2048: model.type = e_model::MODEL_XL; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 64:
+                        switch (hparams.n_embd) {
+                            case 2560: model.type = e_model::MODEL_3B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }

@@ -3345,30 +3648,25 @@ static void llm_load_vocab(

     const auto kv = LLM_KV(model.arch);

-    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
-    if (token_idx == -1) {
-        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
-    }
-
-    const float * scores = nullptr;
-    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
-    if (score_idx != -1) {
-        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
-    }
-
-    const int * toktypes = nullptr;
-    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
-    if (toktype_idx != -1) {
-        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-    }
-
     // determine vocab type
     {
         std::string tokenizer_name;

         ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);

-        if (tokenizer_name == "
+        if (tokenizer_name == "no_vocab") {
+            vocab.type = LLAMA_VOCAB_TYPE_NONE;
+
+            // default special tokens
+            vocab.special_bos_id = -1;
+            vocab.special_eos_id = -1;
+            vocab.special_unk_id = -1;
+            vocab.special_sep_id = -1;
+            vocab.special_pad_id = -1;
+            vocab.linefeed_id = -1;
+
+            return;
+        } else if (tokenizer_name == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;

             // default special tokens
@@ -3395,7 +3693,7 @@ static void llm_load_vocab(

         for (int i = 0; i < n_merges; i++) {
             const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
-            GGML_ASSERT(
+            GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);

             std::string first;
             std::string second;
@@ -3434,13 +3732,30 @@ static void llm_load_vocab(
         }
     }

+    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
+    if (token_idx == -1) {
+        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
+    }
+
+    const float * scores = nullptr;
+    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
+    if (score_idx != -1) {
+        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+    }
+
+    const int * toktypes = nullptr;
+    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
+    if (toktype_idx != -1) {
+        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+    }
+
     const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);

     vocab.id_to_token.resize(n_vocab);

     for (uint32_t i = 0; i < n_vocab; i++) {
         std::string word = gguf_get_arr_str(ctx, token_idx, i);
-        GGML_ASSERT(
+        GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);

         vocab.token_to_id[word] = i;

@@ -3632,6 +3947,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
     LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
+    LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
     LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
     LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
     LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
@@ -3639,6 +3955,10 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
     LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
     LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
+    LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
+    LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
+    LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
+    LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
     if (ml.n_elements >= 1e12) {
@@ -3692,6 +4012,7 @@ static bool llm_load_tensors(

     // there is very little benefit to offloading the input layer, so always keep it on the CPU
     model.buft_input = llama_default_buffer_type_cpu(true);
+    //model.buft_input = llama_default_buffer_type_offload(main_gpu);

     model.buft_layer.resize(n_layer);

@@ -3825,7 +4146,13 @@ static bool llm_load_tensors(
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     if (model.arch != LLM_ARCH_MINICPM){
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                        // if output is NULL, init from the input tok embed
+                        if (model.output == NULL) {
+                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                            ml.n_created--; // artificial tensor
+                            ml.size_data += ggml_nbytes(model.output);
+                        }
                     }
                 }

@@ -4490,6 +4817,107 @@ static bool llm_load_tensors(
                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                     }
                 } break;
+            case LLM_ARCH_STARCODER2:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                        // if output is NULL, init from the input tok embed
+                        if (model.output == NULL) {
+                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                            ml.n_created--; // artificial tensor
+                            ml.size_data += ggml_nbytes(model.output);
+                        }
+
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                        // optional bias tensors
+                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+
+                        // optional bias tensors
+                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+                        layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff});
+                    }
+                } break;
+            case LLM_ARCH_MAMBA:
+                {
+                    const int64_t d_conv = hparams.ssm_d_conv;
+                    const int64_t d_inner = hparams.ssm_d_inner;
+                    const int64_t d_state = hparams.ssm_d_state;
+                    const int64_t dt_rank = hparams.ssm_dt_rank;
+                    // only an expansion factor of 2 is supported for now
+                    GGML_ASSERT(2 * n_embd == d_inner);
+
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
+                        if (model.output == NULL) {
+                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                            ml.n_created--; // artificial tensor
+                            ml.size_data += ggml_nbytes(model.output);
+                        }
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        // norm
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        layer.ssm_in = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner});
+
+                        layer.ssm_conv1d = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner});
+                        layer.ssm_conv1d_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner});
+
+                        layer.ssm_x = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state});
+
+                        layer.ssm_dt = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner});
+                        layer.ssm_dt_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner});
+
+                        // no "weight" suffix for these
+                        layer.ssm_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner});
+                        layer.ssm_d = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_D, i), {d_inner});
+
+                        // out_proj
+                        layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -4610,7 +5038,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam

         llm_load_print_meta(ml, model);

-        if (model.
+        if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
+            model.hparams.n_vocab != model.vocab.id_to_token.size()) {
             throw std::runtime_error("vocab size mismatch");
         }

@@ -4674,29 +5103,32 @@ enum llm_norm_type {

 static struct ggml_tensor * llm_build_inp_embd(
         struct ggml_context * ctx,
+        struct llama_context & lctx,
         const llama_hparams & hparams,
         const llama_batch & batch,
         struct ggml_tensor * tok_embd,
-        struct ggml_tensor * inp_tokens,
-        struct ggml_tensor * inp_embd,
         const llm_build_cb & cb) {
     const int64_t n_embd = hparams.n_embd;

     struct ggml_tensor * inpL;

     if (batch.token) {
-
-        cb(inp_tokens, "inp_tokens", -1);
+        lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
+        cb(lctx.inp_tokens, "inp_tokens", -1);
+        ggml_set_input(lctx.inp_tokens);

-        inpL = ggml_get_rows(ctx, tok_embd,
+        inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
     } else {
 #ifdef GGML_USE_MPI
         GGML_ASSERT(false && "not implemented");
 #endif
-
-        inpL =
+        lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
+        inpL = lctx.inp_embd;
+        ggml_set_input(lctx.inp_embd);
     }

+    cb(inpL, "inp_embd", -1);
+
     return inpL;
 }

@@ -4715,6 +5147,8 @@ static void llm_build_kv_store(
     const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
     const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();

+    GGML_ASSERT(kv.size == n_ctx);
+
     // compute the transposed [n_tokens, n_embd] V matrix
     struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
     //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
@@ -4901,8 +5335,8 @@ static struct ggml_tensor * llm_build_kqv(
         ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
     }

-#if defined(
-#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for
+#if defined(GGML_USE_KOMPUTE)
+#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
 #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
 #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
     if (hparams.f_max_alibi_bias > 0.0f) {
@@ -4924,6 +5358,8 @@ static struct ggml_tensor * llm_build_kqv(
         cb(kq, "kq_soft_max_ext", il);
     }

+    GGML_ASSERT(kv.size == n_ctx);
+
     // split cached v into n_head heads
     struct ggml_tensor * v =
         ggml_view_3d(ctx, kv.v_l[il],
@@ -4986,6 +5422,7 @@ static struct ggml_tensor * llm_build_kv(
     llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);

     struct ggml_tensor * cur;
+
     cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
             q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
     cb(cur, "kqv_out", il);
@@ -4995,7 +5432,7 @@ static struct ggml_tensor * llm_build_kv(

 struct llm_build_context {
     const llama_model & model;
-
+    llama_context & lctx;
     const llama_hparams & hparams;
     const llama_cparams & cparams;
     const llama_batch & batch;
@@ -5070,10 +5507,10 @@ struct llm_build_context {
         norm_eps (hparams.f_norm_eps),
         norm_rms_eps (hparams.f_norm_rms_eps),
         n_tokens (batch.n_tokens),
-        n_kv (worst_case ?
-        kv_head (worst_case ?
+        n_kv (worst_case ? kv_self.size : kv_self.n),
+        kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
         n_orig_ctx (cparams.n_yarn_orig_ctx),
-        pooling_type (cparams.
+        pooling_type (cparams.pooling_type),
         rope_type (hparams.rope_type),
         cb (cb),
         buf_compute_meta (lctx.buf_compute_meta) {
@@ -5088,6 +5525,18 @@ struct llm_build_context {
         };

         ctx0 = ggml_init(params);
+
+        lctx.inp_tokens = nullptr;
+        lctx.inp_embd = nullptr;
+        lctx.inp_pos = nullptr;
+        lctx.inp_KQ_mask = nullptr;
+        lctx.inp_KQ_pos = nullptr;
+        lctx.inp_K_shift = nullptr;
+        lctx.inp_mean = nullptr;
+        lctx.inp_cls = nullptr;
+        lctx.inp_s_copy = nullptr;
+        lctx.inp_s_mask = nullptr;
+        lctx.inp_s_seq = nullptr;
     }

     void free() {
@@ -5100,6 +5549,12 @@ struct llm_build_context {
     struct ggml_cgraph * build_k_shift() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+        GGML_ASSERT(kv_self.size == n_ctx);
+
+        lctx.inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+        cb(lctx.inp_K_shift, "K_shift", -1);
+        ggml_set_input(lctx.inp_K_shift);
+
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * tmp =
             // we rotate only the first n_rot dimensions
@@ -5118,6 +5573,29 @@ struct llm_build_context {
|
|
5118
5573
|
return gf;
|
5119
5574
|
}
|
5120
5575
|
|
5576
|
+
struct ggml_cgraph * build_s_copy() {
|
5577
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5578
|
+
|
5579
|
+
GGML_ASSERT(kv_self.recurrent);
|
5580
|
+
|
5581
|
+
struct ggml_tensor * state_copy = build_inp_s_copy();
|
5582
|
+
|
5583
|
+
for (int il = 0; il < n_layer; ++il) {
|
5584
|
+
struct ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s(), kv_self.size);
|
5585
|
+
struct ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], hparams.n_embd_v_s(), kv_self.size);
|
5586
|
+
|
5587
|
+
conv_states = ggml_get_rows(ctx0, conv_states, state_copy);
|
5588
|
+
ssm_states = ggml_get_rows(ctx0, ssm_states, state_copy);
|
5589
|
+
|
5590
|
+
// TODO: name the intermediate tensors with cb()
|
5591
|
+
|
5592
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_states, kv_self.k_l[il]));
|
5593
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, ssm_states, kv_self.v_l[il]));
|
5594
|
+
}
|
5595
|
+
|
5596
|
+
return gf;
|
5597
|
+
}
|
5598
|
+
|
5121
5599
|
struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
|
5122
5600
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5123
5601
|
|
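
The new build_s_copy() graph above rearranges the recurrent states that Mamba-style models keep in the K/V slots: lctx.inp_s_copy holds, for every cell, the index of the cell whose state it should now contain, and ggml_get_rows performs the gather before the result is copied back over k_l/v_l. A rough standalone sketch of that gather-and-writeback, with invented state contents (plain C++, not the ggml graph):

    #include <cstdio>
    #include <vector>

    int main() {
        // Invented storage: 4 cells, 3 floats of recurrent state each.
        std::vector<std::vector<float>> states = {
            {0, 0, 0}, {1, 1, 1}, {2, 2, 2}, {3, 3, 3},
        };
        // s_copy[i] = cell whose state cell i should take over
        // (e.g. after a sequence copy or cache defragmentation).
        std::vector<int> s_copy = {0, 0, 3, 3};

        // Gather (the ggml_get_rows step) followed by the copy back into the cache.
        std::vector<std::vector<float>> gathered(states.size());
        for (size_t i = 0; i < s_copy.size(); ++i) {
            gathered[i] = states[s_copy[i]];
        }
        states = gathered;

        for (size_t i = 0; i < states.size(); ++i) {
            std::printf("cell %zu now holds the state with value %.0f\n", i, states[i][0]);
        }
        return 0;
    }
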
@@ -5167,6 +5645,66 @@ struct llm_build_context {
|
|
5167
5645
|
return gf;
|
5168
5646
|
}
|
5169
5647
|
|
5648
|
+
struct ggml_tensor * build_inp_pos() {
|
5649
|
+
lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
5650
|
+
cb(lctx.inp_pos, "inp_pos", -1);
|
5651
|
+
ggml_set_input(lctx.inp_pos);
|
5652
|
+
return lctx.inp_pos;
|
5653
|
+
}
|
5654
|
+
|
5655
|
+
struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
|
5656
|
+
if (causal) {
|
5657
|
+
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
|
5658
|
+
} else {
|
5659
|
+
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
|
5660
|
+
}
|
5661
|
+
cb(lctx.inp_KQ_mask, "KQ_mask", -1);
|
5662
|
+
ggml_set_input(lctx.inp_KQ_mask);
|
5663
|
+
return lctx.inp_KQ_mask;
|
5664
|
+
}
|
5665
|
+
|
5666
|
+
struct ggml_tensor * build_inp_KQ_pos() {
|
5667
|
+
lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
|
5668
|
+
cb(lctx.inp_KQ_pos, "KQ_pos", -1);
|
5669
|
+
ggml_set_input(lctx.inp_KQ_pos);
|
5670
|
+
return lctx.inp_KQ_pos;
|
5671
|
+
}
|
5672
|
+
|
5673
|
+
struct ggml_tensor * build_inp_mean() {
|
5674
|
+
lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
|
5675
|
+
cb(lctx.inp_mean, "inp_mean", -1);
|
5676
|
+
ggml_set_input(lctx.inp_mean);
|
5677
|
+
return lctx.inp_mean;
|
5678
|
+
}
|
5679
|
+
|
5680
|
+
struct ggml_tensor * build_inp_cls() {
|
5681
|
+
lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
5682
|
+
cb(lctx.inp_cls, "inp_cls", -1);
|
5683
|
+
ggml_set_input(lctx.inp_cls);
|
5684
|
+
return lctx.inp_cls;
|
5685
|
+
}
|
5686
|
+
|
5687
|
+
struct ggml_tensor * build_inp_s_copy() {
|
5688
|
+
lctx.inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, kv_self.size);
|
5689
|
+
cb(lctx.inp_s_copy, "inp_s_copy", -1);
|
5690
|
+
ggml_set_input(lctx.inp_s_copy);
|
5691
|
+
return lctx.inp_s_copy;
|
5692
|
+
}
|
5693
|
+
|
5694
|
+
struct ggml_tensor * build_inp_s_mask() {
|
5695
|
+
lctx.inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv);
|
5696
|
+
cb(lctx.inp_s_mask, "inp_s_mask", -1);
|
5697
|
+
ggml_set_input(lctx.inp_s_mask);
|
5698
|
+
return lctx.inp_s_mask;
|
5699
|
+
}
|
5700
|
+
|
5701
|
+
struct ggml_tensor * build_inp_s_seq() {
|
5702
|
+
lctx.inp_s_seq = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens);
|
5703
|
+
cb(lctx.inp_s_seq, "inp_s_seq", -1);
|
5704
|
+
ggml_set_input(lctx.inp_s_seq);
|
5705
|
+
return lctx.inp_s_seq;
|
5706
|
+
}
|
5707
|
+
|
5170
5708
|
struct ggml_cgraph * build_llama() {
|
5171
5709
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5172
5710
|
|
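
The block of build_inp_* helpers added above replaces the per-architecture boilerplate that each build_* function used to carry: every helper allocates its tensor on ctx0, names it through the cb callback, marks it with ggml_set_input() and stores the pointer on the context so that llama_set_inputs() can fill it just before evaluation (and skip inputs a given graph never created, since init() resets them to nullptr). The sketch below only mimics that declare-now/fill-later pattern with ordinary containers; the names and sizes are illustrative, not the library API.

    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    // Minimal stand-in for "declare a named graph input, fill it later".
    struct GraphInputs {
        std::map<std::string, std::vector<float>> inputs;

        std::vector<float> & declare(const std::string & name, size_t n) {
            inputs[name] = std::vector<float>(n, 0.0f); // like ggml_new_tensor_* + ggml_set_input
            return inputs[name];
        }
    };

    int main() {
        const int n_tokens = 4;
        const int n_kv     = 8;

        GraphInputs g;
        g.declare("inp_pos", n_tokens);          // token positions
        g.declare("KQ_mask", n_kv * n_tokens);   // attention mask, filled at decode time
        g.declare("inp_cls", n_tokens);          // CLS pooling indices

        for (const auto & kv : g.inputs) {
            std::printf("declared input %-8s (%zu elements)\n", kv.first.c_str(), kv.second.size());
        }
        return 0;
    }
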
@@ -5177,16 +5715,13 @@ struct llm_build_context {
|
|
5177
5715
|
struct ggml_tensor * cur;
|
5178
5716
|
struct ggml_tensor * inpL;
|
5179
5717
|
|
5180
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
|
5181
|
-
cb(inpL, "inp_embd", -1);
|
5718
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
5182
5719
|
|
5183
5720
|
// inp_pos - contains the positions
|
5184
|
-
struct ggml_tensor * inp_pos =
|
5185
|
-
cb(inp_pos, "inp_pos", -1);
|
5721
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
5186
5722
|
|
5187
5723
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5188
|
-
struct ggml_tensor * KQ_mask =
|
5189
|
-
cb(KQ_mask, "KQ_mask", -1);
|
5724
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
5190
5725
|
|
5191
5726
|
for (int il = 0; il < n_layer; ++il) {
|
5192
5727
|
struct ggml_tensor * inpSA = inpL;
|
@@ -5238,7 +5773,6 @@ struct llm_build_context {
|
|
5238
5773
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5239
5774
|
model.layers[il].wo, model.layers[il].bo,
|
5240
5775
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5241
|
-
cb(cur, "kqv_out", il);
|
5242
5776
|
}
|
5243
5777
|
|
5244
5778
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
@@ -5356,20 +5890,16 @@ struct llm_build_context {
|
|
5356
5890
|
struct ggml_tensor * cur;
|
5357
5891
|
struct ggml_tensor * inpL;
|
5358
5892
|
|
5359
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
|
5360
|
-
cb(inpL, "inp_embd", -1);
|
5893
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
5361
5894
|
|
5362
5895
|
// inp_pos - contains the positions
|
5363
|
-
struct ggml_tensor * inp_pos =
|
5364
|
-
cb(inp_pos, "inp_pos", -1);
|
5896
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
5365
5897
|
|
5366
5898
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5367
|
-
struct ggml_tensor * KQ_mask =
|
5368
|
-
cb(KQ_mask, "KQ_mask", -1);
|
5899
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
5369
5900
|
|
5370
5901
|
// positions of the tokens in the KV cache
|
5371
|
-
struct ggml_tensor * KQ_pos =
|
5372
|
-
cb(KQ_pos, "KQ_pos", -1);
|
5902
|
+
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
5373
5903
|
|
5374
5904
|
for (int il = 0; il < n_layer; ++il) {
|
5375
5905
|
struct ggml_tensor * inpSA = inpL;
|
@@ -5417,7 +5947,6 @@ struct llm_build_context {
|
|
5417
5947
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5418
5948
|
model.layers[il].wo, NULL,
|
5419
5949
|
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5420
|
-
cb(cur, "kqv_out", il);
|
5421
5950
|
}
|
5422
5951
|
|
5423
5952
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
@@ -5473,16 +6002,13 @@ struct llm_build_context {
|
|
5473
6002
|
struct ggml_tensor * cur;
|
5474
6003
|
struct ggml_tensor * inpL;
|
5475
6004
|
|
5476
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
|
5477
|
-
cb(inpL, "inp_embd", -1);
|
6005
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
5478
6006
|
|
5479
6007
|
// inp_pos - contains the positions
|
5480
|
-
struct ggml_tensor * inp_pos =
|
5481
|
-
cb(inp_pos, "inp_pos", -1);
|
6008
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
5482
6009
|
|
5483
6010
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5484
|
-
struct ggml_tensor * KQ_mask =
|
5485
|
-
cb(KQ_mask, "KQ_mask", -1);
|
6011
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
5486
6012
|
|
5487
6013
|
for (int il = 0; il < n_layer; ++il) {
|
5488
6014
|
struct ggml_tensor * attn_norm;
|
@@ -5536,7 +6062,6 @@ struct llm_build_context {
|
|
5536
6062
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5537
6063
|
model.layers[il].wo, NULL,
|
5538
6064
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5539
|
-
cb(cur, "kqv_out", il);
|
5540
6065
|
}
|
5541
6066
|
|
5542
6067
|
struct ggml_tensor * ffn_inp = cur;
|
@@ -5587,21 +6112,17 @@ struct llm_build_context {
|
|
5587
6112
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
5588
6113
|
|
5589
6114
|
struct ggml_tensor * cur;
|
5590
|
-
struct ggml_tensor * pos;
|
5591
6115
|
struct ggml_tensor * inpL;
|
5592
6116
|
|
5593
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
|
5594
|
-
cb(inpL, "inp_embd", -1);
|
6117
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
5595
6118
|
|
5596
6119
|
// inp_pos - contains the positions
|
5597
|
-
struct ggml_tensor * inp_pos =
|
5598
|
-
cb(inp_pos, "inp_pos", -1);
|
6120
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
5599
6121
|
|
5600
6122
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5601
|
-
struct ggml_tensor * KQ_mask =
|
5602
|
-
cb(KQ_mask, "KQ_mask", -1);
|
6123
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
5603
6124
|
|
5604
|
-
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
6125
|
+
struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
5605
6126
|
cb(pos, "pos_embd", -1);
|
5606
6127
|
|
5607
6128
|
inpL = ggml_add(ctx0, inpL, pos);
|
@@ -5635,7 +6156,6 @@ struct llm_build_context {
|
|
5635
6156
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5636
6157
|
model.layers[il].wo, model.layers[il].bo,
|
5637
6158
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5638
|
-
cb(cur, "kqv_out", il);
|
5639
6159
|
}
|
5640
6160
|
|
5641
6161
|
// add the input
|
@@ -5687,16 +6207,13 @@ struct llm_build_context {
|
|
5687
6207
|
struct ggml_tensor * cur;
|
5688
6208
|
struct ggml_tensor * inpL;
|
5689
6209
|
|
5690
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
|
5691
|
-
cb(inpL, "inp_embd", -1);
|
6210
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
5692
6211
|
|
5693
6212
|
// inp_pos - contains the positions
|
5694
|
-
struct ggml_tensor * inp_pos =
|
5695
|
-
cb(inp_pos, "inp_pos", -1);
|
6213
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
5696
6214
|
|
5697
6215
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5698
|
-
struct ggml_tensor * KQ_mask =
|
5699
|
-
cb(KQ_mask, "KQ_mask", -1);
|
6216
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
5700
6217
|
|
5701
6218
|
for (int il = 0; il < n_layer; ++il) {
|
5702
6219
|
struct ggml_tensor * residual = inpL;
|
@@ -5836,7 +6353,6 @@ struct llm_build_context {
|
|
5836
6353
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5837
6354
|
model.layers[il].wo, model.layers[il].bo,
|
5838
6355
|
Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5839
|
-
cb(cur, "kqv_out", il);
|
5840
6356
|
}
|
5841
6357
|
|
5842
6358
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
|
@@ -5890,16 +6406,13 @@ struct llm_build_context {
|
|
5890
6406
|
struct ggml_tensor * cur;
|
5891
6407
|
struct ggml_tensor * inpL;
|
5892
6408
|
|
5893
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
|
5894
|
-
cb(inpL, "inp_embd", -1);
|
6409
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
5895
6410
|
|
5896
6411
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5897
|
-
struct ggml_tensor * KQ_mask =
|
5898
|
-
cb(KQ_mask, "KQ_mask", -1);
|
6412
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
5899
6413
|
|
5900
6414
|
// positions of the tokens in the KV cache
|
5901
|
-
struct ggml_tensor * KQ_pos =
|
5902
|
-
cb(KQ_pos, "KQ_pos", -1);
|
6415
|
+
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
5903
6416
|
|
5904
6417
|
for (int il = 0; il < n_layer; ++il) {
|
5905
6418
|
struct ggml_tensor * inpSA = inpL;
|
@@ -5929,7 +6442,6 @@ struct llm_build_context {
|
|
5929
6442
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
5930
6443
|
model.layers[il].wo, NULL,
|
5931
6444
|
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
5932
|
-
cb(cur, "kqv_out", il);
|
5933
6445
|
}
|
5934
6446
|
|
5935
6447
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
@@ -5979,19 +6491,18 @@ struct llm_build_context {
|
|
5979
6491
|
|
5980
6492
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
5981
6493
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
6494
|
+
|
5982
6495
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
5983
6496
|
|
5984
6497
|
struct ggml_tensor * cur;
|
5985
6498
|
struct ggml_tensor * inpL;
|
5986
6499
|
|
5987
|
-
|
5988
|
-
|
5989
|
-
struct ggml_tensor *
|
5990
|
-
struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
|
5991
|
-
struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
|
6500
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
6501
|
+
struct ggml_tensor * inp_mean = build_inp_mean();
|
6502
|
+
struct ggml_tensor * inp_cls = build_inp_cls();
|
5992
6503
|
|
5993
6504
|
// construct input embeddings (token, type, position)
|
5994
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
|
6505
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
5995
6506
|
|
5996
6507
|
// token types are hardcoded to zero ("Sentence A")
|
5997
6508
|
struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
|
@@ -6006,39 +6517,37 @@ struct llm_build_context {
|
|
6006
6517
|
cb(inpL, "inp_norm", -1);
|
6007
6518
|
|
6008
6519
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6009
|
-
struct ggml_tensor * KQ_mask =
|
6010
|
-
cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
|
6520
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false);
|
6011
6521
|
|
6012
6522
|
// iterate layers
|
6013
6523
|
for (int il = 0; il < n_layer; ++il) {
|
6014
6524
|
struct ggml_tensor * cur = inpL;
|
6015
6525
|
|
6526
|
+
struct ggml_tensor * Qcur;
|
6527
|
+
struct ggml_tensor * Kcur;
|
6528
|
+
struct ggml_tensor * Vcur;
|
6529
|
+
|
6016
6530
|
// self-attention
|
6017
6531
|
if (model.arch == LLM_ARCH_BERT) {
|
6018
|
-
|
6532
|
+
Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
|
6019
6533
|
cb(Qcur, "Qcur", il);
|
6020
6534
|
|
6021
|
-
|
6535
|
+
Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
|
6022
6536
|
cb(Kcur, "Kcur", il);
|
6023
6537
|
|
6024
|
-
|
6538
|
+
Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
|
6025
6539
|
cb(Vcur, "Vcur", il);
|
6026
6540
|
|
6027
|
-
|
6028
|
-
|
6029
|
-
|
6030
|
-
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6031
|
-
model.layers[il].wo, model.layers[il].bo,
|
6032
|
-
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6033
|
-
cb(cur, "kqv_out", il);
|
6541
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
6542
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
6034
6543
|
} else {
|
6035
6544
|
// compute Q and K and RoPE them
|
6036
6545
|
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
6037
6546
|
cb(cur, "wqkv", il);
|
6038
6547
|
|
6039
|
-
|
6040
|
-
|
6041
|
-
|
6548
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
6549
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
6550
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
6042
6551
|
|
6043
6552
|
cb(Qcur, "Qcur", il);
|
6044
6553
|
cb(Kcur, "Kcur", il);
|
@@ -6057,12 +6566,40 @@ struct llm_build_context {
|
|
6057
6566
|
ext_factor, attn_factor, beta_fast, beta_slow
|
6058
6567
|
);
|
6059
6568
|
cb(Kcur, "Kcur", il);
|
6569
|
+
}
|
6060
6570
|
|
6061
|
-
|
6062
|
-
|
6063
|
-
|
6064
|
-
|
6571
|
+
struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
6572
|
+
struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
|
6573
|
+
|
6574
|
+
struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
|
6575
|
+
cb(kq, "kq", il);
|
6576
|
+
|
6577
|
+
kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
|
6578
|
+
cb(kq, "kq_soft_max_ext", il);
|
6579
|
+
|
6580
|
+
struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
|
6581
|
+
cb(v, "v", il);
|
6582
|
+
|
6583
|
+
struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
|
6584
|
+
cb(kqv, "kqv", il);
|
6585
|
+
|
6586
|
+
struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
|
6587
|
+
cb(kqv_merged, "kqv_merged", il);
|
6588
|
+
|
6589
|
+
cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
|
6590
|
+
cb(cur, "kqv_merged_cont", il);
|
6591
|
+
|
6592
|
+
ggml_build_forward_expand(gf, cur);
|
6593
|
+
|
6594
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
|
6595
|
+
if (model.layers[il].bo) {
|
6596
|
+
cb(cur, "kqv_wo", il);
|
6597
|
+
}
|
6598
|
+
|
6599
|
+
if (model.layers[il].bo) {
|
6600
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bo);
|
6065
6601
|
}
|
6602
|
+
cb(cur, "kqv_out", il);
|
6066
6603
|
|
6067
6604
|
// re-add the layer input
|
6068
6605
|
cur = ggml_add(ctx0, cur, inpL);
|
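
On this embedding-model path the attention output is now assembled explicitly rather than via llm_build_kv: K and Q are permuted and multiplied, ggml_soft_max_ext applies the KQ_mask, the 1/sqrt(n_embd_head) scale and (when f_max_alibi_bias is set) the ALiBi bias, and the result weights V. The snippet below is a minimal single-head rendition of that masked-softmax attention in plain C++ (ALiBi omitted, toy sizes and values, a non-causal per-sequence mask of the kind this path uses); it is meant only to make the math concrete, not to mirror the ggml kernels.

    #include <cmath>
    #include <cstdio>

    int main() {
        const int n = 3, d = 2;  // 3 tokens, head size 2 (toy values)
        float Q[n][d] = {{1,0},{0,1},{1,1}};
        float K[n][d] = {{1,0},{0,1},{1,1}};
        float V[n][d] = {{1,0},{0,1},{2,2}};
        // 0.0f = may attend, -INFINITY = masked out (same convention as KQ_mask).
        // Tokens 0 and 1 share a sequence, token 2 is its own sequence.
        float mask[n][n] = {
            {0, 0, -INFINITY},
            {0, 0, -INFINITY},
            {-INFINITY, -INFINITY, 0},
        };

        float out[n][d] = {};
        for (int j = 0; j < n; ++j) {              // query token j
            float w[n], sum = 0.0f;
            for (int i = 0; i < n; ++i) {          // key token i
                float dot = 0.0f;
                for (int c = 0; c < d; ++c) dot += Q[j][c] * K[i][c];
                w[i] = std::exp(dot / std::sqrt((float) d) + mask[j][i]);
                sum += w[i];
            }
            for (int i = 0; i < n; ++i) {
                for (int c = 0; c < d; ++c) out[j][c] += (w[i] / sum) * V[i][c];
            }
        }
        for (int j = 0; j < n; ++j) {
            std::printf("token %d -> (%.3f, %.3f)\n", j, out[j][0], out[j][1]);
        }
        return 0;
    }
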
@@ -6103,16 +6640,29 @@ struct llm_build_context {
|
|
6103
6640
|
|
6104
6641
|
// final output
|
6105
6642
|
cur = inpL;
|
6643
|
+
cb(cur, "result_embd", -1);
|
6106
6644
|
|
6107
6645
|
// pooling layer
|
6108
|
-
|
6109
|
-
|
6110
|
-
|
6111
|
-
|
6112
|
-
|
6113
|
-
|
6646
|
+
switch (pooling_type) {
|
6647
|
+
case LLAMA_POOLING_TYPE_NONE:
|
6648
|
+
{
|
6649
|
+
// nop
|
6650
|
+
} break;
|
6651
|
+
case LLAMA_POOLING_TYPE_MEAN:
|
6652
|
+
{
|
6653
|
+
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
|
6654
|
+
cb(cur, "result_embd_pooled", -1);
|
6655
|
+
} break;
|
6656
|
+
case LLAMA_POOLING_TYPE_CLS:
|
6657
|
+
{
|
6658
|
+
cur = ggml_get_rows(ctx0, cur, inp_cls);
|
6659
|
+
cb(cur, "result_embd_pooled", -1);
|
6660
|
+
} break;
|
6661
|
+
case LLAMA_POOLING_TYPE_UNSPECIFIED:
|
6662
|
+
{
|
6663
|
+
GGML_ASSERT(false && "Invalid pooling type");
|
6664
|
+
} break;
|
6114
6665
|
}
|
6115
|
-
cb(cur, "result_embd", -1);
|
6116
6666
|
|
6117
6667
|
ggml_build_forward_expand(gf, cur);
|
6118
6668
|
|
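
The new pooling switch above either passes the per-token embeddings through (NONE), averages them per sequence (MEAN, via the inp_mean matrix), or keeps one row per sequence (CLS, via ggml_get_rows over inp_cls). For the CLS case, llama_set_inputs later records for every sequence the batch index of its token at position 0; the following toy gather shows what that amounts to (batch layout invented for illustration):

    #include <cstdio>
    #include <vector>

    int main() {
        // Invented batch: 5 tokens, 2 sequences. seq 0 owns tokens 0..2, seq 1 owns 3..4.
        std::vector<int>   seq_id = {0, 0, 0, 1, 1};
        std::vector<int>   pos    = {0, 1, 2, 0, 1};
        std::vector<float> embd   = {10, 11, 12, 20, 21};  // 1-dim "embeddings"

        // inp_cls[s] = batch index of the pos == 0 token of sequence s.
        std::vector<int> inp_cls(2, 0);
        for (size_t i = 0; i < seq_id.size(); ++i) {
            if (pos[i] == 0) inp_cls[seq_id[i]] = (int) i;
        }

        // Equivalent of ggml_get_rows(cur, inp_cls): one pooled row per sequence.
        for (size_t s = 0; s < inp_cls.size(); ++s) {
            std::printf("sequence %zu -> pooled embedding %.0f\n", s, embd[inp_cls[s]]);
        }
        return 0;
    }
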
@@ -6129,16 +6679,13 @@ struct llm_build_context {
|
|
6129
6679
|
struct ggml_tensor * cur;
|
6130
6680
|
struct ggml_tensor * inpL;
|
6131
6681
|
|
6132
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
|
6133
|
-
cb(inpL, "inp_embd", -1);
|
6682
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6134
6683
|
|
6135
6684
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6136
|
-
struct ggml_tensor * KQ_mask =
|
6137
|
-
cb(KQ_mask, "KQ_mask", -1);
|
6685
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
6138
6686
|
|
6139
6687
|
// positions of the tokens in the KV cache
|
6140
|
-
struct ggml_tensor * KQ_pos =
|
6141
|
-
cb(KQ_pos, "KQ_pos", -1);
|
6688
|
+
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
6142
6689
|
|
6143
6690
|
inpL = llm_build_norm(ctx0, inpL, hparams,
|
6144
6691
|
model.tok_norm,
|
@@ -6174,7 +6721,6 @@ struct llm_build_context {
|
|
6174
6721
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6175
6722
|
model.layers[il].wo, model.layers[il].bo,
|
6176
6723
|
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6177
|
-
cb(cur, "kqv_out", il);
|
6178
6724
|
}
|
6179
6725
|
|
6180
6726
|
// Add the input
|
@@ -6226,16 +6772,13 @@ struct llm_build_context {
|
|
6226
6772
|
struct ggml_tensor * cur;
|
6227
6773
|
struct ggml_tensor * inpL;
|
6228
6774
|
|
6229
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
|
6230
|
-
cb(inpL, "inp_embd", -1);
|
6775
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6231
6776
|
|
6232
6777
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6233
|
-
struct ggml_tensor * KQ_mask =
|
6234
|
-
cb(KQ_mask, "KQ_mask", -1);
|
6778
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
6235
6779
|
|
6236
6780
|
// positions of the tokens in the KV cache
|
6237
|
-
struct ggml_tensor * KQ_pos =
|
6238
|
-
cb(KQ_pos, "KQ_pos", -1);
|
6781
|
+
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
6239
6782
|
|
6240
6783
|
for (int il = 0; il < n_layer; ++il) {
|
6241
6784
|
struct ggml_tensor * attn_norm;
|
@@ -6276,7 +6819,6 @@ struct llm_build_context {
|
|
6276
6819
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6277
6820
|
model.layers[il].wo, model.layers[il].bo,
|
6278
6821
|
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6279
|
-
cb(cur, "kqv_out", il);
|
6280
6822
|
}
|
6281
6823
|
|
6282
6824
|
// Add the input
|
@@ -6331,16 +6873,13 @@ struct llm_build_context {
|
|
6331
6873
|
struct ggml_tensor * cur;
|
6332
6874
|
struct ggml_tensor * inpL;
|
6333
6875
|
|
6334
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
|
6335
|
-
cb(inpL, "inp_embd", -1);
|
6876
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6336
6877
|
|
6337
6878
|
// inp_pos - contains the positions
|
6338
|
-
struct ggml_tensor * inp_pos =
|
6339
|
-
cb(inp_pos, "inp_pos", -1);
|
6879
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
6340
6880
|
|
6341
6881
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6342
|
-
struct ggml_tensor * KQ_mask =
|
6343
|
-
cb(KQ_mask, "KQ_mask", -1);
|
6882
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
6344
6883
|
|
6345
6884
|
for (int il = 0; il < n_layer; ++il) {
|
6346
6885
|
struct ggml_tensor * inpSA = inpL;
|
@@ -6393,7 +6932,6 @@ struct llm_build_context {
|
|
6393
6932
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6394
6933
|
model.layers[il].wo, NULL,
|
6395
6934
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6396
|
-
cb(cur, "kqv_out", il);
|
6397
6935
|
}
|
6398
6936
|
|
6399
6937
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
@@ -6449,16 +6987,13 @@ struct llm_build_context {
|
|
6449
6987
|
struct ggml_tensor * cur;
|
6450
6988
|
struct ggml_tensor * inpL;
|
6451
6989
|
|
6452
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
|
6453
|
-
cb(inpL, "inp_embd", -1);
|
6990
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6454
6991
|
|
6455
6992
|
// inp_pos - contains the positions
|
6456
|
-
struct ggml_tensor * inp_pos =
|
6457
|
-
cb(inp_pos, "inp_pos", -1);
|
6993
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
6458
6994
|
|
6459
6995
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6460
|
-
struct ggml_tensor * KQ_mask =
|
6461
|
-
cb(KQ_mask, "KQ_mask", -1);
|
6996
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
6462
6997
|
|
6463
6998
|
for (int il = 0; il < n_layer; ++il) {
|
6464
6999
|
struct ggml_tensor * inpSA = inpL;
|
@@ -6503,7 +7038,6 @@ struct llm_build_context {
|
|
6503
7038
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6504
7039
|
model.layers[il].wo, NULL,
|
6505
7040
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6506
|
-
cb(cur, "kqv_out", il);
|
6507
7041
|
}
|
6508
7042
|
|
6509
7043
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
@@ -6558,16 +7092,13 @@ struct llm_build_context {
|
|
6558
7092
|
struct ggml_tensor * cur;
|
6559
7093
|
struct ggml_tensor * inpL;
|
6560
7094
|
|
6561
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
|
6562
|
-
cb(inpL, "inp_embd", -1);
|
7095
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6563
7096
|
|
6564
7097
|
// inp_pos - contains the positions
|
6565
|
-
struct ggml_tensor * inp_pos =
|
6566
|
-
cb(inp_pos, "inp_pos", -1);
|
7098
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
6567
7099
|
|
6568
7100
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6569
|
-
struct ggml_tensor * KQ_mask =
|
6570
|
-
cb(KQ_mask, "KQ_mask", -1);
|
7101
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
6571
7102
|
|
6572
7103
|
for (int il = 0; il < n_layer; ++il) {
|
6573
7104
|
struct ggml_tensor * inpSA = inpL;
|
@@ -6619,7 +7150,6 @@ struct llm_build_context {
|
|
6619
7150
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6620
7151
|
model.layers[il].wo, model.layers[il].bo,
|
6621
7152
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6622
|
-
cb(cur, "kqv_out", il);
|
6623
7153
|
}
|
6624
7154
|
|
6625
7155
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
@@ -6674,16 +7204,13 @@ struct llm_build_context {
|
|
6674
7204
|
struct ggml_tensor * ffn_output;
|
6675
7205
|
struct ggml_tensor * inpL;
|
6676
7206
|
|
6677
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
|
6678
|
-
cb(inpL, "inp_embd", -1);
|
7207
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6679
7208
|
|
6680
7209
|
// inp_pos - contains the positions
|
6681
|
-
struct ggml_tensor * inp_pos =
|
6682
|
-
cb(inp_pos, "inp_pos", -1);
|
7210
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
6683
7211
|
|
6684
7212
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6685
|
-
struct ggml_tensor * KQ_mask =
|
6686
|
-
cb(KQ_mask, "KQ_mask", -1);
|
7213
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
6687
7214
|
|
6688
7215
|
for (int il = 0; il < n_layer; ++il) {
|
6689
7216
|
attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
@@ -6741,7 +7268,6 @@ struct llm_build_context {
|
|
6741
7268
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6742
7269
|
model.layers[il].wo, model.layers[il].bo,
|
6743
7270
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
6744
|
-
cb(cur, "kqv_out", il);
|
6745
7271
|
}
|
6746
7272
|
|
6747
7273
|
// FF
|
@@ -6791,16 +7317,13 @@ struct llm_build_context {
|
|
6791
7317
|
struct ggml_tensor * cur;
|
6792
7318
|
struct ggml_tensor * inpL;
|
6793
7319
|
|
6794
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
|
6795
|
-
cb(inpL, "inp_embd", -1);
|
7320
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6796
7321
|
|
6797
7322
|
// inp_pos - contains the positions
|
6798
|
-
struct ggml_tensor * inp_pos =
|
6799
|
-
cb(inp_pos, "inp_pos", -1);
|
7323
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
6800
7324
|
|
6801
7325
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6802
|
-
struct ggml_tensor * KQ_mask =
|
6803
|
-
cb(KQ_mask, "KQ_mask", -1);
|
7326
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
6804
7327
|
|
6805
7328
|
for (int il = 0; il < n_layer; ++il) {
|
6806
7329
|
|
@@ -6839,7 +7362,6 @@ struct llm_build_context {
|
|
6839
7362
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6840
7363
|
model.layers[il].wo, NULL,
|
6841
7364
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6842
|
-
cb(cur, "kqv_out", il);
|
6843
7365
|
}
|
6844
7366
|
struct ggml_tensor * sa_out = cur;
|
6845
7367
|
|
@@ -6893,16 +7415,13 @@ struct llm_build_context {
|
|
6893
7415
|
struct ggml_tensor * pos;
|
6894
7416
|
struct ggml_tensor * inpL;
|
6895
7417
|
|
6896
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
|
6897
|
-
cb(inpL, "inp_embd", -1);
|
7418
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6898
7419
|
|
6899
7420
|
// inp_pos - contains the positions
|
6900
|
-
struct ggml_tensor * inp_pos =
|
6901
|
-
cb(inp_pos, "inp_pos", -1);
|
7421
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
6902
7422
|
|
6903
7423
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
6904
|
-
struct ggml_tensor * KQ_mask =
|
6905
|
-
cb(KQ_mask, "KQ_mask", -1);
|
7424
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
6906
7425
|
|
6907
7426
|
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
6908
7427
|
cb(pos, "pos_embd", -1);
|
@@ -6938,7 +7457,6 @@ struct llm_build_context {
|
|
6938
7457
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
6939
7458
|
model.layers[il].wo, model.layers[il].bo,
|
6940
7459
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
6941
|
-
cb(cur, "kqv_out", il);
|
6942
7460
|
}
|
6943
7461
|
|
6944
7462
|
// add the input
|
@@ -6991,16 +7509,13 @@ struct llm_build_context {
|
|
6991
7509
|
struct ggml_tensor * cur;
|
6992
7510
|
struct ggml_tensor * inpL;
|
6993
7511
|
|
6994
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
|
6995
|
-
cb(inpL, "inp_embd", -1);
|
7512
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
6996
7513
|
|
6997
7514
|
// inp_pos - contains the positions
|
6998
|
-
struct ggml_tensor * inp_pos =
|
6999
|
-
cb(inp_pos, "inp_pos", -1);
|
7515
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
7000
7516
|
|
7001
7517
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
7002
|
-
struct ggml_tensor * KQ_mask =
|
7003
|
-
cb(KQ_mask, "KQ_mask", -1);
|
7518
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
7004
7519
|
|
7005
7520
|
for (int il = 0; il < n_layer; ++il) {
|
7006
7521
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
@@ -7042,7 +7557,6 @@ struct llm_build_context {
|
|
7042
7557
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7043
7558
|
model.layers[il].wo, model.layers[il].bo,
|
7044
7559
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7045
|
-
cb(cur, "kqv_out", il);
|
7046
7560
|
}
|
7047
7561
|
|
7048
7562
|
// add the input
|
@@ -7094,16 +7608,13 @@ struct llm_build_context {
|
|
7094
7608
|
struct ggml_tensor * cur;
|
7095
7609
|
struct ggml_tensor * inpL;
|
7096
7610
|
|
7097
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
|
7098
|
-
cb(inpL, "inp_embd", -1);
|
7611
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
7099
7612
|
|
7100
7613
|
// inp_pos - contains the positions
|
7101
|
-
struct ggml_tensor * inp_pos =
|
7102
|
-
cb(inp_pos, "inp_pos", -1);
|
7614
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
7103
7615
|
|
7104
7616
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
7105
|
-
struct ggml_tensor * KQ_mask =
|
7106
|
-
cb(KQ_mask, "KQ_mask", -1);
|
7617
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
7107
7618
|
|
7108
7619
|
for (int il = 0; il < n_layer; ++il) {
|
7109
7620
|
struct ggml_tensor * inpSA = inpL;
|
@@ -7155,7 +7666,6 @@ struct llm_build_context {
|
|
7155
7666
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7156
7667
|
model.layers[il].wo, NULL,
|
7157
7668
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7158
|
-
cb(cur, "kqv_out", il);
|
7159
7669
|
}
|
7160
7670
|
|
7161
7671
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
@@ -7208,16 +7718,13 @@ struct llm_build_context {
|
|
7208
7718
|
struct ggml_tensor * cur;
|
7209
7719
|
struct ggml_tensor * inpL;
|
7210
7720
|
|
7211
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
|
7212
|
-
cb(inpL, "inp_embd", -1);
|
7721
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
7213
7722
|
|
7214
7723
|
// inp_pos - contains the positions
|
7215
|
-
struct ggml_tensor * inp_pos =
|
7216
|
-
cb(inp_pos, "inp_pos", -1);
|
7724
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
7217
7725
|
|
7218
7726
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
7219
|
-
struct ggml_tensor * KQ_mask =
|
7220
|
-
cb(KQ_mask, "KQ_mask", -1);
|
7727
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
7221
7728
|
|
7222
7729
|
for (int il = 0; il < n_layer; ++il) {
|
7223
7730
|
struct ggml_tensor * inpSA = inpL;
|
@@ -7269,7 +7776,6 @@ struct llm_build_context {
|
|
7269
7776
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7270
7777
|
model.layers[il].wo, model.layers[il].bo,
|
7271
7778
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7272
|
-
cb(cur, "kqv_out", il);
|
7273
7779
|
}
|
7274
7780
|
|
7275
7781
|
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
@@ -7331,20 +7837,17 @@ struct llm_build_context {
|
|
7331
7837
|
struct ggml_tensor * cur;
|
7332
7838
|
struct ggml_tensor * inpL;
|
7333
7839
|
|
7334
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
|
7335
|
-
cb(inpL, "inp_embd", -1);
|
7840
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
7336
7841
|
|
7337
7842
|
// scale the input embeddings
|
7338
7843
|
inpL = ggml_scale(ctx0, inpL, scale_embd);
|
7339
7844
|
cb(inpL, "inp_scaled", -1);
|
7340
7845
|
|
7341
7846
|
// inp_pos - contains the positions
|
7342
|
-
struct ggml_tensor * inp_pos =
|
7343
|
-
cb(inp_pos, "inp_pos", -1);
|
7847
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
7344
7848
|
|
7345
7849
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
7346
|
-
struct ggml_tensor * KQ_mask =
|
7347
|
-
cb(KQ_mask, "KQ_mask", -1);
|
7850
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
7348
7851
|
|
7349
7852
|
for (int il = 0; il < n_layer; ++il) {
|
7350
7853
|
struct ggml_tensor * inpSA = inpL;
|
@@ -7396,7 +7899,6 @@ struct llm_build_context {
|
|
7396
7899
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7397
7900
|
model.layers[il].wo, model.layers[il].bo,
|
7398
7901
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7399
|
-
cb(cur, "kqv_out", il);
|
7400
7902
|
}
|
7401
7903
|
|
7402
7904
|
// scale_res - scale the hidden states for residual connection
|
@@ -7463,22 +7965,18 @@ struct llm_build_context {
|
|
7463
7965
|
struct ggml_tensor * cur;
|
7464
7966
|
struct ggml_tensor * inpL;
|
7465
7967
|
|
7466
|
-
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
|
7467
|
-
cb(inpL, "inp_embd", -1);
|
7968
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
7468
7969
|
|
7469
7970
|
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
|
7470
7971
|
cb(inpL, "inp_scaled", -1);
|
7471
7972
|
|
7472
7973
|
// inp_pos - contains the positions
|
7473
|
-
struct ggml_tensor * inp_pos =
|
7474
|
-
cb(inp_pos, "inp_pos", -1);
|
7974
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
7475
7975
|
|
7476
7976
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
7477
|
-
struct ggml_tensor * KQ_mask =
|
7478
|
-
cb(KQ_mask, "KQ_mask", -1);
|
7977
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
7479
7978
|
|
7480
7979
|
for (int il = 0; il < n_layer; ++il) {
|
7481
|
-
|
7482
7980
|
// norm
|
7483
7981
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
7484
7982
|
model.layers[il].attn_norm, NULL,
|
@@ -7515,7 +8013,6 @@ struct llm_build_context {
|
|
7515
8013
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7516
8014
|
model.layers[il].wo, NULL,
|
7517
8015
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
7518
|
-
cb(cur, "kqv_out", il);
|
7519
8016
|
}
|
7520
8017
|
|
7521
8018
|
struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
|
@@ -7559,6 +8056,255 @@ struct llm_build_context {
|
|
7559
8056
|
|
7560
8057
|
return gf;
|
7561
8058
|
}
|
8059
|
+
|
8060
|
+
struct ggml_cgraph * build_starcoder2() {
|
8061
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
8062
|
+
|
8063
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
8064
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
8065
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
8066
|
+
|
8067
|
+
struct ggml_tensor * cur;
|
8068
|
+
struct ggml_tensor * inpL;
|
8069
|
+
|
8070
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
8071
|
+
|
8072
|
+
// inp_pos - contains the positions
|
8073
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
8074
|
+
|
8075
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
8076
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
8077
|
+
|
8078
|
+
for (int il = 0; il < n_layer; ++il) {
|
8079
|
+
struct ggml_tensor * inpSA = inpL;
|
8080
|
+
|
8081
|
+
// norm
|
8082
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
8083
|
+
model.layers[il].attn_norm, model.layers[il].attn_norm_b,
|
8084
|
+
LLM_NORM, cb, il);
|
8085
|
+
cb(cur, "attn_norm", il);
|
8086
|
+
|
8087
|
+
// self-attention
|
8088
|
+
{
|
8089
|
+
// compute Q and K and RoPE them
|
8090
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
8091
|
+
cb(Qcur, "Qcur", il);
|
8092
|
+
if (model.layers[il].bq) {
|
8093
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
8094
|
+
cb(Qcur, "Qcur", il);
|
8095
|
+
}
|
8096
|
+
|
8097
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
8098
|
+
cb(Kcur, "Kcur", il);
|
8099
|
+
if (model.layers[il].bk) {
|
8100
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
8101
|
+
cb(Kcur, "Kcur", il);
|
8102
|
+
}
|
8103
|
+
|
8104
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
8105
|
+
cb(Vcur, "Vcur", il);
|
8106
|
+
if (model.layers[il].bv) {
|
8107
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
8108
|
+
cb(Vcur, "Vcur", il);
|
8109
|
+
}
|
8110
|
+
|
8111
|
+
Qcur = ggml_rope_custom(
|
8112
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
8113
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8114
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
8115
|
+
);
|
8116
|
+
cb(Qcur, "Qcur", il);
|
8117
|
+
|
8118
|
+
Kcur = ggml_rope_custom(
|
8119
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
8120
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8121
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
8122
|
+
);
|
8123
|
+
cb(Kcur, "Kcur", il);
|
8124
|
+
|
8125
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8126
|
+
model.layers[il].wo, model.layers[il].bo,
|
8127
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8128
|
+
cb(cur, "kqv_out", il);
|
8129
|
+
}
|
8130
|
+
|
8131
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
8132
|
+
cb(ffn_inp, "ffn_inp", il);
|
8133
|
+
|
8134
|
+
// feed-forward network
|
8135
|
+
|
8136
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
8137
|
+
model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
|
8138
|
+
LLM_NORM, cb, il);
|
8139
|
+
cb(cur, "ffn_norm", il);
|
8140
|
+
|
8141
|
+
cur = llm_build_ffn(ctx0, cur,
|
8142
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
8143
|
+
NULL, NULL,
|
8144
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
8145
|
+
NULL,
|
8146
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
8147
|
+
cb(cur, "ffn_out", il);
|
8148
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
8149
|
+
cb(cur, "l_out", il);
|
8150
|
+
|
8151
|
+
// input for next layer
|
8152
|
+
inpL = cur;
|
8153
|
+
}
|
8154
|
+
|
8155
|
+
cur = inpL;
|
8156
|
+
|
8157
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
8158
|
+
model.output_norm, model.output_norm_b,
|
8159
|
+
LLM_NORM, cb, -1);
|
8160
|
+
cb(cur, "result_norm", -1);
|
8161
|
+
|
8162
|
+
// lm_head
|
8163
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
8164
|
+
cb(cur, "result_output", -1);
|
8165
|
+
|
8166
|
+
ggml_build_forward_expand(gf, cur);
|
8167
|
+
|
8168
|
+
return gf;
|
8169
|
+
}
|
8170
|
+
|
8171
|
+
struct ggml_cgraph * build_mamba() {
|
8172
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
8173
|
+
|
8174
|
+
const int64_t d_model = n_embd;
|
8175
|
+
const int64_t d_conv = hparams.ssm_d_conv;
|
8176
|
+
const int64_t d_inner = hparams.ssm_d_inner;
|
8177
|
+
GGML_ASSERT(2 * d_model == d_inner);
|
8178
|
+
const int64_t d_state = hparams.ssm_d_state;
|
8179
|
+
const int64_t dt_rank = hparams.ssm_dt_rank;
|
8180
|
+
|
8181
|
+
struct ggml_tensor * cur;
|
8182
|
+
struct ggml_tensor * inpL;
|
8183
|
+
|
8184
|
+
// {n_embd, n_tokens}
|
8185
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
8186
|
+
|
8187
|
+
struct ggml_tensor * state_mask = build_inp_s_mask();
|
8188
|
+
struct ggml_tensor * state_seq = build_inp_s_seq();
|
8189
|
+
|
8190
|
+
for (int il = 0; il < n_layer; ++il) {
|
8191
|
+
// (ab)using the KV cache to store the states
|
8192
|
+
struct ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s(), kv_self.size);
|
8193
|
+
struct ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], hparams.n_embd_v_s(), kv_self.size);
|
8194
|
+
|
8195
|
+
// clear states of sequences which are starting at the beginning of this batch
|
8196
|
+
{
|
8197
|
+
conv_states = ggml_mul(ctx0,
|
8198
|
+
ggml_view_2d(ctx0, conv_states, conv_states->ne[0], n_kv, conv_states->nb[1], kv_head*conv_states->nb[1]),
|
8199
|
+
state_mask);
|
8200
|
+
ssm_states = ggml_mul(ctx0,
|
8201
|
+
ggml_view_2d(ctx0, ssm_states, ssm_states->ne[0], n_kv, ssm_states->nb[1], kv_head*ssm_states->nb[1]),
|
8202
|
+
state_mask);
|
8203
|
+
}
|
8204
|
+
|
8205
|
+
conv_states = ggml_reshape_3d(ctx0, conv_states, d_conv - 1, d_inner, n_kv);
|
8206
|
+
ssm_states = ggml_reshape_3d(ctx0, ssm_states, d_state, d_inner, n_kv);
|
8207
|
+
|
8208
|
+
// norm
|
8209
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
8210
|
+
model.layers[il].attn_norm, NULL,
|
8211
|
+
LLM_NORM_RMS, cb, il);
|
8212
|
+
cb(cur, "attn_norm", il);
|
8213
|
+
|
8214
|
+
// {n_embd, 2*d_inner} * {n_embd, n_tokens} => {2*d_inner, n_tokens}
|
8215
|
+
struct ggml_tensor * xz = ggml_mul_mat(ctx0, model.layers[il].ssm_in, cur);
|
8216
|
+
// split the above in two
|
8217
|
+
// => {d_inner, n_tokens}
|
8218
|
+
struct ggml_tensor * x = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], 0);
|
8219
|
+
struct ggml_tensor * z = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], ggml_element_size(xz)*d_inner);
|
8220
|
+
|
8221
|
+
// conv
|
8222
|
+
{
|
8223
|
+
// Custom operator which is needed only to ease simultaneous sequence processing.
|
8224
|
+
// For a single sequence, the equivalent is to concatenate the columns of conv_states and x,
|
8225
|
+
// then make a self-overlapping view of that over d_conv columns at each stride in the 3rd dimension,
|
8226
|
+
// then element-wise multiply that with the conv1d weight,
// then element-wise multiply that with the conv1d weight,
|
8227
|
+
// then sum the elements of each row,
|
8228
|
+
// (the last two steps are a dot product over rows (also doable with mul_mat))
|
8229
|
+
// then permute away the ne[0] dimension,
|
8230
|
+
// and then you're left with the resulting x tensor.
|
8231
|
+
// The new conv_states is the last (d_conv - 1) columns
|
8232
|
+
// of the last 3rd dimensional "layer" of the self-overlapping view.
|
8233
|
+
// For simultaneous sequences, it's more complicated.
|
8234
|
+
struct ggml_tensor * x_conv = ggml_ssm_conv(ctx0, conv_states, x, model.layers[il].ssm_conv1d, state_seq);
|
8235
|
+
|
8236
|
+
// store last (d_conv - 1) columns of the conv_state part of x_conv back into the KV cache
|
8237
|
+
ggml_build_forward_expand(gf,
|
8238
|
+
ggml_cpy(ctx0,
|
8239
|
+
ggml_view_2d(ctx0, x_conv, d_conv - 1, d_inner*n_kv, d_conv*ggml_element_size(x_conv), (1+d_inner*n_tokens)*ggml_element_size(x_conv)),
|
8240
|
+
ggml_view_1d(ctx0, kv_self.k_l[il], (d_conv - 1)*(d_inner)*(n_kv), kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(x_conv))));
|
8241
|
+
|
8242
|
+
// extract x from x_conv
|
8243
|
+
x = ggml_view_2d(ctx0, x_conv, d_inner, n_tokens, d_inner*ggml_element_size(x_conv), 0);
|
8244
|
+
|
8245
|
+
// bias
|
8246
|
+
x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b);
|
8247
|
+
|
8248
|
+
x = ggml_silu(ctx0, x);
|
8249
|
+
}
|
8250
|
+
|
8251
|
+
// ssm
|
8252
|
+
{
|
8253
|
+
// {d_inner, dt_rank + 2*d_state} * {d_inner, n_tokens} => {dt_rank + 2*d_state, n_tokens}
|
8254
|
+
struct ggml_tensor * x_db = ggml_mul_mat(ctx0, model.layers[il].ssm_x, x);
|
8255
|
+
// split
|
8256
|
+
struct ggml_tensor * dt = ggml_view_2d(ctx0, x_db, dt_rank, n_tokens, x_db->nb[1], 0);
|
8257
|
+
struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*dt_rank);
|
8258
|
+
struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state));
|
8259
|
+
|
8260
|
+
// {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens}
|
8261
|
+
dt = ggml_mul_mat(ctx0, model.layers[il].ssm_dt, dt);
|
8262
|
+
dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
|
8263
|
+
|
8264
|
+
// Custom operator to optimize the parallel associative scan
|
8265
|
+
// as described in the Annex D of the Mamba paper.
|
8266
|
+
// => {d_inner, n_tokens} and {d_state, d_inner, n_kv} combined,
|
8267
|
+
// because only a single tensor can be returned.
|
8268
|
+
struct ggml_tensor * y_ssm_states = ggml_ssm_scan(ctx0, ssm_states, x, dt, model.layers[il].ssm_a, B, C, state_seq);
|
8269
|
+
|
8270
|
+
// store last states (the second part of y_ssm_states)
|
8271
|
+
ggml_build_forward_expand(gf,
|
8272
|
+
ggml_cpy(ctx0,
|
8273
|
+
ggml_view_1d(ctx0, y_ssm_states, d_state*d_inner*n_kv, d_inner*n_tokens*ggml_element_size(y_ssm_states)),
|
8274
|
+
ggml_view_1d(ctx0, kv_self.v_l[il], d_state*d_inner*n_kv, kv_head*d_state*d_inner*ggml_element_size(ssm_states))));
|
8275
|
+
|
8276
|
+
struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0);
|
8277
|
+
|
8278
|
+
// {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens}
|
8279
|
+
y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
|
8280
|
+
y = ggml_mul(ctx0, y, ggml_silu(ctx0, z));
|
8281
|
+
|
8282
|
+
// {d_inner, n_embd} * {d_inner, n_tokens} => {n_embd, n_tokens}
|
8283
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].ssm_out, y);
|
8284
|
+
}
|
8285
|
+
|
8286
|
+
// residual
|
8287
|
+
cur = ggml_add(ctx0, cur, inpL);
|
8288
|
+
cb(cur, "l_out", il);
|
8289
|
+
|
8290
|
+
// input for next layer
|
8291
|
+
inpL = cur;
|
8292
|
+
}
|
8293
|
+
|
8294
|
+
// final rmsnorm
|
8295
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
8296
|
+
model.output_norm, NULL,
|
8297
|
+
LLM_NORM_RMS, cb, -1);
|
8298
|
+
cb(cur, "result_norm", -1);
|
8299
|
+
|
8300
|
+
// lm_head
|
8301
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
8302
|
+
cb(cur, "result_output", -1);
|
8303
|
+
|
8304
|
+
ggml_build_forward_expand(gf, cur);
|
8305
|
+
|
8306
|
+
return gf;
|
8307
|
+
}
|
7562
8308
|
};
|
7563
8309
|
|
7564
8310
|
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
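
build_mamba() above projects each token to a pair (x, z), runs a short causal convolution over the last d_conv columns of x together with the cached conv state (ggml_ssm_conv), and then evaluates the selective state-space recurrence (ggml_ssm_scan), whose output is gated by silu(z) and mapped back to n_embd. The snippet below walks the recurrence from the Mamba paper for a single channel and a single sequence in plain C++: h <- exp(dt*A)*h + dt*B*x, then y = C·h + D*x. All sizes and parameter values are invented, and the real kernels handle every channel and sequence at once (with further details such as the treatment of dt), so treat this purely as a worked example.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        const int d_state  = 2;   // toy state size
        const int n_tokens = 4;

        // Invented per-channel parameters (the model reads these from ssm_a, ssm_d, ...).
        float A[d_state] = {-1.0f, -0.5f};   // negative entries -> decaying state
        float D = 0.1f;                      // skip-connection weight

        std::vector<float> x  = {1.0f, 0.5f, -0.2f, 0.3f};  // post-conv activations
        std::vector<float> dt = {0.2f, 0.3f,  0.1f, 0.2f};  // per-token step sizes
        // Input-dependent B and C (one pair per token), also invented.
        std::vector<std::vector<float>> B = {{1, 0}, {0, 1}, {1, 1}, {0.5f, 0.5f}};
        std::vector<std::vector<float>> C = {{1, 1}, {1, 0}, {0, 1}, {1, 1}};

        float h[d_state] = {0.0f, 0.0f};     // recurrent state (lives in the cache slot)
        for (int t = 0; t < n_tokens; ++t) {
            float y = 0.0f;
            for (int s = 0; s < d_state; ++s) {
                h[s] = std::exp(dt[t] * A[s]) * h[s] + dt[t] * B[t][s] * x[t];
                y   += C[t][s] * h[s];
            }
            y += D * x[t];                   // skip connection through ssm_d
            std::printf("token %d: y = %.4f\n", t, y);
        }
        return 0;
    }
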
@@ -7595,6 +8341,23 @@ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
|
|
7595
8341
|
return result;
|
7596
8342
|
}
|
7597
8343
|
|
8344
|
+
static struct ggml_cgraph * llama_build_graph_s_copy(llama_context & lctx) {
|
8345
|
+
llama_batch dummy;
|
8346
|
+
dummy.n_tokens = 0;
|
8347
|
+
|
8348
|
+
llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
|
8349
|
+
|
8350
|
+
struct llm_build_context llm(lctx, dummy, cb, false);
|
8351
|
+
|
8352
|
+
llm.init();
|
8353
|
+
|
8354
|
+
struct ggml_cgraph * result = llm.build_s_copy();
|
8355
|
+
|
8356
|
+
llm.free();
|
8357
|
+
|
8358
|
+
return result;
|
8359
|
+
}
|
8360
|
+
|
7598
8361
|
static struct ggml_cgraph * llama_build_graph(
|
7599
8362
|
llama_context & lctx,
|
7600
8363
|
const llama_batch & batch,
|
@@ -7612,7 +8375,18 @@ static struct ggml_cgraph * llama_build_graph(
|
|
7612
8375
|
if (!lctx.cparams.offload_kqv) {
|
7613
8376
|
if (strcmp(name, "kqv_merged_cont") == 0) {
|
7614
8377
|
// all nodes between the KV store and the attention output are run on the CPU
|
7615
|
-
|
8378
|
+
ggml_backend_sched_set_tensor_backend(lctx.sched, cur, lctx.backend_cpu);
|
8379
|
+
}
|
8380
|
+
}
|
8381
|
+
|
8382
|
+
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
|
8383
|
+
// to fix this, we assign the norm layer manually to the backend of its layer
|
8384
|
+
if (il != -1 && strcmp(name, "norm") == 0) {
|
8385
|
+
for (auto * backend : lctx.backends) {
|
8386
|
+
if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
|
8387
|
+
ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
|
8388
|
+
break;
|
8389
|
+
}
|
7616
8390
|
}
|
7617
8391
|
}
|
7618
8392
|
};
|
@@ -7705,6 +8479,14 @@ static struct ggml_cgraph * llama_build_graph(
|
|
7705
8479
|
{
|
7706
8480
|
result = llm.build_gemma();
|
7707
8481
|
} break;
|
8482
|
+
case LLM_ARCH_STARCODER2:
|
8483
|
+
{
|
8484
|
+
result = llm.build_starcoder2();
|
8485
|
+
} break;
|
8486
|
+
case LLM_ARCH_MAMBA:
|
8487
|
+
{
|
8488
|
+
result = llm.build_mamba();
|
8489
|
+
} break;
|
7708
8490
|
default:
|
7709
8491
|
GGML_ASSERT(false);
|
7710
8492
|
}
|
@@ -7715,19 +8497,29 @@ static struct ggml_cgraph * llama_build_graph(
|
|
7715
8497
|
}
|
7716
8498
|
|
7717
8499
|
static void llama_set_k_shift(llama_context & lctx) {
|
7718
|
-
const
|
7719
|
-
|
7720
|
-
const int64_t n_ctx = cparams.n_ctx;
|
8500
|
+
const int64_t kv_size = lctx.kv_self.size;
|
7721
8501
|
|
7722
8502
|
assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
|
7723
8503
|
|
7724
8504
|
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
|
7725
8505
|
|
7726
|
-
for (int i = 0; i <
|
8506
|
+
for (int i = 0; i < kv_size; ++i) {
|
7727
8507
|
data[i] = lctx.kv_self.cells[i].delta;
|
7728
8508
|
}
|
7729
8509
|
}
|
7730
8510
|
|
8511
|
+
static void llama_set_s_copy(llama_context & lctx) {
|
8512
|
+
const int64_t kv_size = lctx.kv_self.size;
|
8513
|
+
|
8514
|
+
assert(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer));
|
8515
|
+
|
8516
|
+
int32_t * data = (int32_t *) lctx.inp_s_copy->data;
|
8517
|
+
|
8518
|
+
for (int i = 0; i < kv_size; ++i) {
|
8519
|
+
data[i] = lctx.kv_self.cells[i].src;
|
8520
|
+
}
|
8521
|
+
}
|
8522
|
+
|
7731
8523
|
static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
7732
8524
|
//
|
7733
8525
|
// set input data
|
@@ -7750,34 +8542,74 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
7750
8542
|
ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
|
7751
8543
|
}
|
7752
8544
|
|
7753
|
-
if (batch.pos) {
|
8545
|
+
if (batch.pos && lctx.inp_pos) {
|
7754
8546
|
const int64_t n_tokens = batch.n_tokens;
|
7755
8547
|
|
7756
8548
|
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
|
7757
8549
|
}
|
7758
8550
|
|
7759
|
-
|
7760
|
-
|
7761
|
-
|
8551
|
+
GGML_ASSERT(
|
8552
|
+
(hparams.causal_attn || !cparams.causal_attn) &&
|
8553
|
+
"non-causal attention with generative models is not supported"
|
8554
|
+
);
|
7762
8555
|
|
7763
|
-
|
8556
|
+
if (lctx.inp_KQ_mask) {
|
8557
|
+
// NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache.
|
8558
|
+
if (cparams.causal_attn) {
|
8559
|
+
const int64_t n_kv = kv_self.n;
|
8560
|
+
const int64_t n_tokens = batch.n_tokens;
|
7764
8561
|
|
7765
|
-
|
8562
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
|
7766
8563
|
|
7767
|
-
|
7768
|
-
for (int j = 0; j < n_tokens; ++j) {
|
7769
|
-
const llama_pos pos = batch.pos[j];
|
7770
|
-
const llama_seq_id seq_id = batch.seq_id[j][0];
|
8564
|
+
float * data = (float *) lctx.inp_KQ_mask->data;
|
7771
8565
|
|
7772
|
-
|
7773
|
-
|
7774
|
-
|
7775
|
-
|
7776
|
-
|
7777
|
-
|
7778
|
-
|
8566
|
+
// For causal attention, use only the previous KV cells
|
8567
|
+
// of the correct sequence for each token of the batch.
|
8568
|
+
// It's assumed that if a token in the batch has multiple sequences, they are equivalent.
|
8569
|
+
for (int h = 0; h < 1; ++h) {
|
8570
|
+
for (int j = 0; j < n_tokens; ++j) {
|
8571
|
+
const llama_pos pos = batch.pos[j];
|
8572
|
+
const llama_seq_id seq_id = batch.seq_id[j][0];
|
8573
|
+
|
8574
|
+
for (int i = 0; i < n_kv; ++i) {
|
8575
|
+
float f;
|
8576
|
+
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
|
8577
|
+
f = -INFINITY;
|
8578
|
+
} else {
|
8579
|
+
f = 0.0f;
|
8580
|
+
}
|
8581
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
8582
|
+
}
|
8583
|
+
}
|
8584
|
+
}
|
8585
|
+
} else {
|
8586
|
+
// when using kv cache, the mask needs to match the kv cache size
|
8587
|
+
const int64_t n_tokens = batch.n_tokens;
|
8588
|
+
const int64_t n_stride = hparams.causal_attn ? kv_self.n : n_tokens;
|
8589
|
+
|
8590
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
|
8591
|
+
|
8592
|
+
float * data = (float *) lctx.inp_KQ_mask->data;
|
8593
|
+
|
8594
|
+
for (int h = 0; h < 1; ++h) {
|
8595
|
+
for (int j = 0; j < n_tokens; ++j) {
|
8596
|
+
const llama_seq_id seq_id = batch.seq_id[j][0];
|
8597
|
+
|
8598
|
+
for (int i = 0; i < n_tokens; ++i) {
|
8599
|
+
float f = -INFINITY;
|
8600
|
+
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
|
8601
|
+
if (batch.seq_id[i][s] == seq_id) {
|
8602
|
+
f = 0.0f;
|
8603
|
+
break;
|
8604
|
+
}
|
8605
|
+
}
|
8606
|
+
|
8607
|
+
data[h*(n_tokens*n_tokens) + j*n_stride + i] = f;
|
8608
|
+
}
|
8609
|
+
|
8610
|
+
for (int i = n_tokens; i < n_stride; ++i) {
|
8611
|
+
data[h*(n_tokens*n_tokens) + j*n_stride + i] = -INFINITY;
|
7779
8612
|
}
|
7780
|
-
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
7781
8613
|
}
|
7782
8614
|
}
|
7783
8615
|
}
|
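
The causal branch above gives batch token j access to KV cell i only when the cell belongs to the same sequence and its position is not later than the token's; every other entry is set to -INFINITY so it contributes nothing after the softmax. The same rule, restated as a tiny standalone program with an invented cache layout (real cells can belong to several sequences; one per cell is enough for the illustration):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    struct KvCell { int pos; int seq_id; };  // simplified: one sequence per cell

    int main() {
        // Invented cache: 4 cells already used by two sequences.
        std::vector<KvCell> cells = {{0, 0}, {1, 0}, {0, 1}, {1, 1}};
        // Batch of 2 new tokens: (pos, seq_id).
        std::vector<KvCell> batch = {{2, 0}, {2, 1}};

        for (size_t j = 0; j < batch.size(); ++j) {
            for (size_t i = 0; i < cells.size(); ++i) {
                const bool allowed = cells[i].seq_id == batch[j].seq_id
                                  && cells[i].pos   <= batch[j].pos;
                std::printf("%6.1f ", allowed ? 0.0f : -INFINITY);
            }
            std::printf("  <- KQ_mask row for batch token %zu\n", j);
        }
        return 0;
    }
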
@@ -7786,7 +8618,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
7786
8618
|
if (hparams.need_kq_pos) {
|
7787
8619
|
const int64_t n_kv = kv_self.n;
|
7788
8620
|
|
7789
|
-
|
8621
|
+
GGML_ASSERT(lctx.inp_KQ_pos);
|
8622
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
|
7790
8623
|
|
7791
8624
|
float * data = (float *) lctx.inp_KQ_pos->data;
|
7792
8625
|
|
@@ -7795,17 +8628,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
7795
8628
|
}
|
7796
8629
|
}
|
7797
8630
|
|
7798
|
-
if (cparams.
|
8631
|
+
if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
|
7799
8632
|
const int64_t n_tokens = batch.n_tokens;
|
7800
8633
|
|
8634
|
+
GGML_ASSERT(lctx.inp_mean);
|
7801
8635
|
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
|
7802
|
-
float * data = (float *) lctx.inp_mean->data;
|
7803
8636
|
|
8637
|
+
float * data = (float *) lctx.inp_mean->data;
|
7804
8638
|
memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
|
7805
8639
|
|
7806
8640
|
std::vector<uint64_t> sum(n_tokens, 0);
|
7807
8641
|
for (int i = 0; i < n_tokens; ++i) {
|
7808
8642
|
const llama_seq_id seq_id = batch.seq_id[i][0];
|
8643
|
+
|
8644
|
+
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
|
8645
|
+
|
7809
8646
|
sum[seq_id] += 1;
|
7810
8647
|
}
|
7811
8648
|
|
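
For LLAMA_POOLING_TYPE_MEAN, the loop above first counts how many batch tokens each sequence owns; the rest of the routine (unchanged and outside this hunk) then turns inp_mean into an n_tokens x n_tokens matrix whose row for sequence s carries 1/count(s) at the columns of that sequence's tokens, so multiplying the transposed embeddings by it yields one averaged embedding per sequence. A small self-contained version of that construction, with an invented batch and 1-dimensional "embeddings":

    #include <cstdio>
    #include <vector>

    int main() {
        // Invented batch: 5 tokens, sequence ids as they might appear in batch.seq_id.
        std::vector<int> seq_id = {0, 0, 0, 1, 1};
        const int n_tokens = (int) seq_id.size();

        // inp_mean[s][i] = 1/count(s) if token i belongs to sequence s, else 0.
        std::vector<float> inp_mean(n_tokens * n_tokens, 0.0f);
        std::vector<int>   count(n_tokens, 0);
        for (int i = 0; i < n_tokens; ++i) count[seq_id[i]]++;
        for (int i = 0; i < n_tokens; ++i) {
            inp_mean[seq_id[i]*n_tokens + i] = 1.0f / count[seq_id[i]];
        }

        // Applying the matrix to toy embeddings gives the per-sequence means.
        std::vector<float> embd = {1, 2, 3, 10, 20};
        for (int s = 0; s < 2; ++s) {
            float mean = 0.0f;
            for (int i = 0; i < n_tokens; ++i) mean += inp_mean[s*n_tokens + i] * embd[i];
            std::printf("sequence %d mean embedding: %.2f\n", s, mean);
        }
        return 0;
    }
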
@@ -7823,20 +8660,73 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
7823
8660
|
}
|
7824
8661
|
}
|
7825
8662
|
|
7826
|
-
if (cparams.
|
8663
|
+
if (cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
|
7827
8664
|
const int64_t n_tokens = batch.n_tokens;
|
7828
8665
|
|
8666
|
+
GGML_ASSERT(lctx.inp_cls);
|
7829
8667
|
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
|
8668
|
+
|
7830
8669
|
uint32_t * data = (uint32_t *) lctx.inp_cls->data;
|
8670
|
+
memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
|
7831
8671
|
|
7832
8672
|
for (int i = 0; i < n_tokens; ++i) {
|
7833
8673
|
const llama_seq_id seq_id = batch.seq_id[i][0];
|
7834
|
-
const llama_pos
|
8674
|
+
const llama_pos pos = batch.pos[i];
|
8675
|
+
|
8676
|
+
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
|
8677
|
+
|
7835
8678
|
if (pos == 0) {
|
7836
8679
|
data[seq_id] = i;
|
7837
8680
|
}
|
7838
8681
|
}
|
7839
8682
|
}
|
8683
|
+
|
8684
|
+
if (kv_self.recurrent) {
|
8685
|
+
const int64_t n_kv = kv_self.n;
|
8686
|
+
|
8687
|
+
if (lctx.inp_s_mask) {
|
8688
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer));
|
8689
|
+
float * data = (float *) lctx.inp_s_mask->data;
|
8690
|
+
|
8691
|
+
// states which are not affected by the current batch are left untouched
|
8692
|
+
for (int i = 0; i < n_kv; ++i) {
|
8693
|
+
llama_seq_id seq_id = i + lctx.kv_self.head;
|
8694
|
+
llama_kv_cell & kv_cell = lctx.kv_self.cells[seq_id];
|
8695
|
+
bool has_self_seq = kv_cell.has_seq_id(seq_id);
|
8696
|
+
|
8697
|
+
data[i] = (float) has_self_seq;
|
8698
|
+
|
8699
|
+
// ensure current sequences will be kept
|
8700
|
+
if (!has_self_seq && kv_cell.pos >= 0) {
|
8701
|
+
kv_cell.seq_id.insert(seq_id);
|
8702
|
+
}
|
8703
|
+
}
|
8704
|
+
}
|
8705
|
+
// For Mamba (and other recurrent architectures),
|
8706
|
+
// update the correct state(s)/sequence(s) for each token of the batch.
|
8707
|
+
// Like with the KQ_mask, if a token in the batch has multiple sequences,
|
8708
|
+
// they are assumed to be equivalent (not here, but in ggml_ssm_scan and ggml_ssm_conv).
|
8709
|
+
if (lctx.inp_s_seq) {
|
8710
|
+
const int64_t n_tokens = batch.n_tokens;
|
8711
|
+
|
8712
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_seq->buffer));
|
8713
|
+
int32_t * data = (int32_t *) lctx.inp_s_seq->data;
|
8714
|
+
|
8715
|
+
for (int j = 0; j < n_tokens; ++j) {
|
8716
|
+
const int32_t n_seq = batch.n_seq_id[j];
|
8717
|
+
GGML_ASSERT(0 < n_seq); // a token should be part of at least 1 sequence
|
8718
|
+
|
8719
|
+
for (int i = 0; i < n_kv; ++i) {
|
8720
|
+
if (i < n_seq) {
|
8721
|
+
// for this type of model, the head is the minimum seq_id of the batch
|
8722
|
+
data[j*n_kv + i] = batch.seq_id[j][i] - kv_self.head;
|
8723
|
+
} else {
|
8724
|
+
data[j*n_kv + i] = -1;
|
8725
|
+
}
|
8726
|
+
}
|
8727
|
+
}
|
8728
|
+
}
|
8729
|
+
}
|
7840
8730
|
}
|
7841
8731
|
|
7842
8732
|
static void llama_graph_compute(
|
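Note: the `inp_mean` buffer filled in the hunk above feeds sequence-level mean pooling (LLAMA_POOLING_TYPE_MEAN). The following standalone sketch is illustrative only and is not part of the diff; all names and sizes here are hypothetical, but it shows the computation that the pooling matrix ultimately performs: averaging the token embeddings that belong to each sequence.

    // illustrative sketch, not library code: mean-pool token embeddings per sequence
    #include <cstdint>
    #include <vector>

    std::vector<float> mean_pool(const std::vector<float>   & token_embd, // n_tokens * n_embd, row-major
                                 const std::vector<int32_t> & seq_of_tok, // sequence id per token
                                 int n_tokens, int n_embd, int n_seq) {
        std::vector<float> out(n_seq * n_embd, 0.0f);
        std::vector<int>   count(n_seq, 0);
        for (int i = 0; i < n_tokens; ++i) {
            count[seq_of_tok[i]]++;
            for (int d = 0; d < n_embd; ++d) {
                out[seq_of_tok[i]*n_embd + d] += token_embd[i*n_embd + d];
            }
        }
        for (int s = 0; s < n_seq; ++s) {
            for (int d = 0; d < n_embd; ++d) {
                if (count[s] > 0) out[s*n_embd + d] /= count[s]; // average of the sequence's tokens
            }
        }
        return out;
    }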
@@ -7856,9 +8746,10 @@ static void llama_graph_compute(

     if (lctx.backend_cpu != nullptr) {
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
+        ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }

-
+    ggml_backend_sched_graph_compute_async(lctx.sched, gf);

     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));

@@ -7878,10 +8769,11 @@ static void llama_graph_compute(
 //
 static int llama_decode_internal(
          llama_context & lctx,
-           llama_batch
-
+           llama_batch   batch_all) { // TODO: rename back to batch
+
+    const uint32_t n_tokens_all = batch_all.n_tokens;

-    if (
+    if (n_tokens_all == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
         return -1;
     }
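Note: the abort callback wired into the CPU backend above is exposed to callers through the new `abort_callback` / `abort_callback_data` fields of `llama_context_params` (see the `llama_context_default_params` hunk further down). A minimal sketch of how a caller might use it, assuming the C API of the vendored llama.cpp at this version (returning true from the callback stops the current graph computation); error handling is omitted:

    // illustrative sketch, not part of the diff
    #include "llama.h"
    #include <atomic>

    static std::atomic<bool> g_interrupted{false};

    static bool my_abort_cb(void * /*user_data*/) {
        return g_interrupted.load(); // true => abort the in-flight llama_decode
    }

    llama_context * make_ctx(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.abort_callback      = my_abort_cb;
        cparams.abort_callback_data = nullptr;
        return llama_new_context_with_model(model, cparams);
    }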
@@ -7890,14 +8782,16 @@ static int llama_decode_internal(
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;

-
+    GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT

-    GGML_ASSERT(
-    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
+    GGML_ASSERT(n_tokens_all <= cparams.n_batch);

-
+    GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");

-
+    if (lctx.t_compute_start_us == 0) {
+        lctx.t_compute_start_us = ggml_time_us();
+    }
+    lctx.n_queued_tokens += n_tokens_all;

 #ifdef GGML_USE_MPI
     // TODO: needs fix after #3228
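Worked example for the new assertion above (numbers are hypothetical): non-causal (embedding) models bypass the KV cache, so the whole submitted batch must fit into a single micro-batch. With the default n_ubatch = 512, submitting a 1,000-token batch to such a model would trip the assert; raising n_ubatch (and n_batch) to at least 1,000 satisfies the condition, since causal_attn is false and n_ubatch >= n_tokens_all must then hold.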
@@ -7905,213 +8799,274 @@ static int llama_decode_internal(
     //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
 #endif

-    GGML_ASSERT(n_threads > 0);
-
     auto & kv_self = lctx.kv_self;

     const int64_t n_embd  = hparams.n_embd;
     const int64_t n_vocab = hparams.n_vocab;

-    // helpers for smoother batch API transition
-    // after deprecating the llama_eval calls, these will be removed
-    std::vector<llama_pos> pos;

-
-    std::vector<llama_seq_id *> seq_id_arr;
-    std::vector<std::vector<llama_seq_id>> seq_id;
+    auto * logits_out = lctx.logits;

-
-
-
-
-    }
+#ifndef NDEBUG
+    auto & logits_valid = lctx.logits_valid;
+    logits_valid.clear();
+    logits_valid.resize(n_tokens_all);

-
-
+    memset(logits_out, 0, lctx.logits_size*sizeof(float));
+#endif

-
-    n_seq_id.resize(n_tokens);
-    seq_id.resize(n_tokens);
-    seq_id_arr.resize(n_tokens);
-    for (uint32_t i = 0; i < n_tokens; i++) {
-        n_seq_id[i] = 1;
-        seq_id[i].resize(1);
-        seq_id[i][0] = batch.all_seq_id;
-        seq_id_arr[i] = seq_id[i].data();
-    }
+    const auto n_ubatch = cparams.n_ubatch;

-
-
-
+    std::vector<llama_pos> pos;
+    std::vector<int32_t>                   n_seq_id;
+    std::vector<llama_seq_id *>            seq_id_arr;
+    std::vector<std::vector<llama_seq_id>> seq_id;

-
+    for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
+        const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
+        llama_batch u_batch = {
+            /* .n_tokens   = */ (int32_t) n_tokens,
+            /* .token      = */ batch_all.token     ? batch_all.token    + cur_token        : nullptr,
+            /* .embd       = */ batch_all.embd      ? batch_all.embd     + cur_token*n_embd : nullptr,
+            /* .pos        = */ batch_all.pos       ? batch_all.pos      + cur_token        : nullptr,
+            /* .n_seq_id   = */ batch_all.n_seq_id  ? batch_all.n_seq_id + cur_token        : nullptr,
+            /* .seq_id     = */ batch_all.seq_id    ? batch_all.seq_id   + cur_token        : nullptr,
+            /* .logits     = */ batch_all.logits    ? batch_all.logits   + cur_token        : nullptr,
+            /* .all_pos_0  = */ batch_all.all_pos_0 + (llama_pos) cur_token*batch_all.all_pos_1,
+            /* .all_pos_1  = */ batch_all.all_pos_1,
+            /* .all_seq_id = */ batch_all.all_seq_id,
+        };

-
-
-        if (kv_self.head > kv_self.used + 2*n_tokens) {
-            kv_self.head = 0;
-        }
+        int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+        GGML_ASSERT(n_threads > 0);

-
-
-
+        // helpers for smoother batch API transition
+        // after deprecating the llama_eval calls, these will be removed
+        if (u_batch.pos == nullptr) {
+            pos.resize(n_tokens);
+            for (uint32_t i = 0; i < n_tokens; i++) {
+                pos[i] = u_batch.all_pos_0 + i*u_batch.all_pos_1;
+            }

-
-
-        // if we start defragmenting the cache, the benefit from this will be more important
-        kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
-        //kv_self.n = llama_kv_cache_cell_max(kv_self);
+            u_batch.pos = pos.data();
+        }

-
+        if (u_batch.seq_id == nullptr) {
+            n_seq_id.resize(n_tokens);
+            seq_id.resize(n_tokens);
+            seq_id_arr.resize(n_tokens);
+            for (uint32_t i = 0; i < n_tokens; i++) {
+                n_seq_id[i] = 1;
+                seq_id[i].resize(1);
+                seq_id[i][0] = u_batch.all_seq_id;
+                seq_id_arr[i] = seq_id[i].data();
+            }

-
-
+            u_batch.n_seq_id = n_seq_id.data();
+            u_batch.seq_id = seq_id_arr.data();
+        }

-
+        // non-causal masks do not use the KV cache
+        if (hparams.causal_attn) {
+            llama_kv_cache_update(&lctx);

-
-
-
+            // if we have enough unused cells before the current head ->
+            //   better to start searching from the beginning of the cache, hoping to fill it
+            if (kv_self.head > kv_self.used + 2*n_tokens) {
+                kv_self.head = 0;
+            }
+
+            if (!llama_kv_cache_find_slot(kv_self, u_batch)) {
+                return 1;
+            }

-
-
-
-
-
+            if (!kv_self.recurrent) {
+                // a heuristic, to avoid attending the full cache if it is not yet utilized
+                // after enough generations, the benefit from this heuristic disappears
+                // if we start defragmenting the cache, the benefit from this will be more important
+                kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
+                //kv_self.n = llama_kv_cache_cell_max(kv_self);
+            }
         }
-    } else if (strcmp(res->name, "result_embd") == 0) {
-        embeddings = res;
-        res = nullptr;
-    } else {
-        GGML_ASSERT(false);
-    }

-
+        //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

-
-
-    // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
-    // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
-    // with the BLAS calls. need a better solution
-    // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
-    // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
-    if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
-        n_threads = std::min(4, n_threads);
-    }
+        ggml_backend_sched_reset(lctx.sched);
+        ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);

-
+        ggml_cgraph * gf = llama_build_graph(lctx, u_batch, false);

-
+        // the output is always the last tensor in the graph
+        struct ggml_tensor * res  = gf->nodes[gf->n_nodes - 1];
+        struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];

-
-
-    kv_self.head += n_tokens;
+        if (!hparams.causal_attn) {
+            res = nullptr; // do not extract logits for embedding models such as BERT

-
-
-        kv_self.head = 0;
-    }
-    }
-
-    // decide if we need to defrag the kv cache
-    if (cparams.defrag_thold >= 0.0f) {
-        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens)/float(kv_self.n) : 0.0f;
+            // token or sequence embeddings
+            embd = gf->nodes[gf->n_nodes - 1];

-
-
-
+            GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
+        } else {
+            if (strcmp(res->name, "result_output") == 0) {
+                // the token embeddings could be the second to last tensor, or the third to last tensor
+                if (strcmp(embd->name, "result_norm") != 0) {
+                    embd = gf->nodes[gf->n_nodes - 3];
+                    GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
+                }
+            } else {
+                GGML_ASSERT(false && "missing result_output tensor");
+            }
+        }
+        // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

-
+        // for big prompts, if BLAS is enabled, it is better to use only one thread
+        // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
+        // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
+        //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
+        //       with the BLAS calls. need a better solution
+        // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
+        //                   being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
+        if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+            n_threads = std::min(4, n_threads);
         }
-    }

-
-    // print timing information per ggml operation (for debugging purposes)
-    // requires GGML_PERF to be defined
-    ggml_graph_print(gf);
-#endif
+        ggml_backend_sched_alloc_graph(lctx.sched, gf);

-
-    //if (n_past%100 == 0) {
-    //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
-    //}
+        llama_set_inputs(lctx, u_batch);

-
-    // TODO: do not compute and extract logits if only embeddings are needed
-    //       need to update the graphs to skip "result_output"
-    if (res) {
-        auto & logits_out = lctx.logits;
+        llama_graph_compute(lctx, gf, n_threads);

-
-
-
-
+        // update the kv ring buffer
+        {
+            kv_self.head += n_tokens;
+
+            // Ensure kv cache head points to a valid index.
+            if (kv_self.head >= kv_self.size) {
+                kv_self.head = 0;
+            }
+        }

-
+#ifdef GGML_PERF
+        // print timing information per ggml operation (for debugging purposes)
+        // requires GGML_PERF to be defined
+        ggml_graph_print(gf);
 #endif

-
-
-
-
-
-
-
-
-
+        // plot the computation graph in dot format (for debugging purposes)
+        //if (n_past%100 == 0) {
+        //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
+        //}
+
+        // extract logits
+        // TODO: do not compute and extract logits if only embeddings are needed
+        //       update the graphs to skip "result_output" if logits are not needed
+        if (res) {
+            ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
+            GGML_ASSERT(backend_res != nullptr);
+            if (u_batch.logits) {
+                int32_t i_first = -1;
+                for (uint32_t i = 0; i < n_tokens; i++) {
+                    if (u_batch.logits[i] && i_first == -1) {
+                        i_first = (int32_t) i;
+                    }
+                    if (u_batch.logits[i] == 0 || i == n_tokens - 1) {
+                        if (i_first != -1) {
+                            int i_last = u_batch.logits[i] == 0 ? i : i + 1;
+                            // extract logits for the range [i_first, i_last)
+                            // group the requests to minimize the number of calls to the backend
+                            ggml_backend_tensor_get_async(backend_res, res,
+                                    logits_out + n_vocab*(cur_token + i_first),
+                                    i_first*n_vocab*sizeof(float),
+                                    (i_last - i_first)*n_vocab*sizeof(float));
+                            i_first = -1;
+                        }
+                    }
 #ifndef NDEBUG
-
+                    logits_valid[cur_token + i] = u_batch.logits[i] != 0;;
 #endif
-
-
-
-                ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
+                }
+            } else if (lctx.logits_all) {
+                ggml_backend_tensor_get_async(backend_res, res, logits_out + n_vocab*cur_token, 0, n_vocab*n_tokens*sizeof(float));
 #ifndef NDEBUG
-
+                std::fill(logits_valid.begin() + cur_token, logits_valid.begin() + cur_token + n_tokens, true);
 #endif
-
-
-
+            } else {
+                if (cur_token + n_tokens >= n_tokens_all) {
+                    ggml_backend_tensor_get_async(backend_res, res, logits_out, n_vocab*(n_tokens - 1)*sizeof(float), n_vocab*sizeof(float));
 #ifndef NDEBUG
-
+                    logits_valid[0] = true;
 #endif
+                }
+            }
         }
-        ggml_backend_synchronize(res_backend);
-    }

-
-
-
+        // extract embeddings
+        if (cparams.embeddings && embd) {
+            ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
+            GGML_ASSERT(backend_embd != nullptr);

-
-
+            switch (cparams.pooling_type) {
+                case LLAMA_POOLING_TYPE_NONE:
+                    {
+                        // extract token embeddings
+                        auto & embd_out = lctx.embd;
+
+                        if (u_batch.logits) {
+                            //embd_out.resize(n_embd * n_tokens);
+                            for (uint32_t i = 0; i < n_tokens; i++) {
+                                if (u_batch.logits[i] == 0) {
+                                    continue;
+                                }
+                                ggml_backend_tensor_get_async(backend_embd, embd, embd_out + n_embd*(i + cur_token), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
+                            }
+                        }
+                    } break;
+                case LLAMA_POOLING_TYPE_CLS:
+                case LLAMA_POOLING_TYPE_MEAN:
+                    {
+                        GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);

-
-
-
-        ggml_backend_synchronize(embeddings_backend);
-    }
+                        // extract sequence embeddings
+                        auto & embd_seq_out = lctx.embd_seq;
+                        embd_seq_out.clear();

-
-
-
-
-
-
-
-
+                        for (uint32_t i = 0; i < n_tokens; i++) {
+                            const llama_seq_id seq_id = u_batch.seq_id[i][0];
+                            if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+                                continue;
+                            }
+                            embd_seq_out[seq_id].resize(n_embd);
+                            ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
+                        }
+                    } break;
+                case LLAMA_POOLING_TYPE_UNSPECIFIED:
+                    {
+                        GGML_ASSERT(false && "unknown pooling type");
+                    } break;
+            }
+        }
     }

-    //
-    //
-
-
-
+    // wait for the computation to finish (automatically done when obtaining the model output)
+    //llama_synchronize(&lctx);
+
+    // decide if we need to defrag the kv cache
+    if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
+        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
+
+        // queue defragmentation for next llama_kv_cache_update
+        if (fragmentation > cparams.defrag_thold) {
+            //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
+
+            llama_kv_cache_defrag(kv_self);
+        }
     }

     return 0;
 }

+
 // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
 static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     auto & kv_self = lctx.kv_self;
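Note: the core structural change in the hunk above is that llama_decode_internal now walks the submitted batch in micro-batches of at most n_ubatch tokens instead of computing it in one pass. The standalone sketch below is illustrative only (hypothetical sizes, not library code) and shows just the window arithmetic of that split:

    // illustrative sketch of the micro-batch windows used by the new decode loop
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t n_tokens_all = 2048; // logical batch submitted by the caller
        const uint32_t n_ubatch     = 512;  // physical micro-batch computed at once

        for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
            const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
            std::printf("window [%u, %u)\n", cur_token, cur_token + n_tokens); // 4 windows of 512
        }
        return 0;
    }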
@@ -8130,6 +9085,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     // number of cells moved
     uint32_t n_moves = 0;

+    // each move requires 6*n_layer tensors (see build_defrag)
+    //   - source view, destination view, copy operation
+    //   - x2 for keys and values
+    const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+
     // determine which KV cells to move where
     //
     //  cell i moves to ids[i]
@@ -8156,15 +9116,6 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             nh++;
         }

-        // each move requires 6*n_layer tensors (see build_defrag)
-        //   - source view, destination view, copy operation
-        //   - x2 for keys and values
-        //
-        if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
-            // the graph is too big, we cannot move more cells
-            break;
-        }
-
         uint32_t nf = 0;
         uint32_t is = n_kv - 1;

@@ -8194,11 +9145,19 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
         // are we moving a continuous block of memory?
         bool cont = false;

+        // should we stop searching for the next move?
+        bool stop = false;
+
         // go back and move the nf cells to the hole
         for (; i1 < n_kv; ++i1) {
             auto & cell1 = kv_self.cells[i1];

             if (cell1.is_empty() || ids[i1] != n_kv) {
+                if (n_moves == max_moves) {
+                    stop = true;
+                    break;
+                }
+
                 cont = false;
                 continue;
             }
@@ -8225,6 +9184,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             }
         }

+        if (stop || n_moves == max_moves) {
+            break;
+        }
+
         //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);

         i0 += nh - 1;
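Worked example for the new max_moves bound: each defrag move costs 6*n_layer graph nodes, so with LLAMA_MAX_NODES = 8192 a hypothetical 32-layer model allows at most 8192/(6*32) = 42 moves per defrag graph; a 64-layer model allows 21.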
@@ -8311,6 +9274,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 #else
     // ggml_graph defrag

+    ggml_backend_sched_reset(lctx.sched);
+
     ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);

     llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
@@ -8322,14 +9287,22 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 }

 static void llama_kv_cache_update_internal(struct llama_context & lctx) {
+    bool need_reserve = false;
+
     // apply K-shift if needed
     if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
-        llama_set_k_shift(lctx);
-
         {
+            ggml_backend_sched_reset(lctx.sched);
+
             ggml_cgraph * gf = llama_build_graph_k_shift(lctx);

+            ggml_backend_sched_alloc_graph(lctx.sched, gf);
+
+            llama_set_k_shift(lctx);
+
             llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+
+            need_reserve = true;
         }

         {
@@ -8343,12 +9316,56 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
         }
     }

+    if (lctx.kv_self.recurrent && lctx.kv_self.do_copy) {
+        {
+            ggml_backend_sched_reset(lctx.sched);
+
+            ggml_cgraph * gf = llama_build_graph_s_copy(lctx);
+
+            ggml_backend_sched_alloc_graph(lctx.sched, gf);
+
+            llama_set_s_copy(lctx);
+
+            llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+
+            need_reserve = true;
+        }
+
+        {
+            auto & kv_self = lctx.kv_self;
+
+            kv_self.do_copy = false;
+
+            for (uint32_t i = 0; i < kv_self.size; ++i) {
+                kv_self.cells[i].src = i;
+            }
+        }
+    }
+
     // defragment the KV cache if needed
     if (lctx.kv_self.do_defrag) {
         llama_kv_cache_defrag_internal(lctx);

+        need_reserve = true;
+
         lctx.kv_self.do_defrag = false;
     }
+
+    // reserve a worst case graph again
+    if (need_reserve) {
+        // TODO: extract to a function
+        // build worst-case graph
+        int n_tokens = (int)std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch);
+        int n_past = lctx.cparams.n_ctx - n_tokens;
+        llama_token token = llama_token_bos(&lctx.model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+        ggml_cgraph * gf = llama_build_graph(lctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
+
+        // initialize scheduler with the worst-case graph
+        ggml_backend_sched_reset(lctx.sched);
+        if (!ggml_backend_sched_reserve(lctx.sched, gf)) {
+            LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
+        }
+    }
 }

 //
@@ -8360,46 +9377,53 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
 }

 static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
 }

 static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
 }

 static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
 }

 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }

 static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
 }

 static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
+    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
     switch (llama_vocab_get_type(vocab)) {
-
-
-
-
-
-
-
-
-
-
-
-
-
+        case LLAMA_VOCAB_TYPE_SPM: {
+            auto buf = token_data.text.substr(3, 2);
+            return strtol(buf.c_str(), NULL, 16);
+        }
+        case LLAMA_VOCAB_TYPE_BPE: {
+            GGML_ASSERT(false);
+            return unicode_utf8_to_byte(token_data.text);
+        }
+        case LLAMA_VOCAB_TYPE_WPM: {
+            GGML_ASSERT(false);
+        }
+        default:
+            GGML_ASSERT(false);
     }
 }

 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
+    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
     static const char * hex = "0123456789ABCDEF";
     switch (llama_vocab_get_type(vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
@@ -8414,7 +9438,7 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
         }
         case LLAMA_VOCAB_TYPE_WPM:
         case LLAMA_VOCAB_TYPE_BPE: {
-            return vocab.token_to_id.at(
+            return vocab.token_to_id.at(unicode_byte_to_utf8(ch));
         }
         default:
             GGML_ASSERT(false);
@@ -8754,9 +9778,9 @@ private:
         bpe_words.reserve(text.size());
         bpe_encoded_words.reserve(text.size());

-        auto
-        for (size_t i = 0; i <
-            text_utf.emplace_back(
+        const auto cpts = unicode_cpts_from_utf8(text);
+        for (size_t i = 0; i < cpts.size(); ++i)
+            text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));

         for (int i = 0; i < (int)text_utf.size(); i++) {
             const std::string & utf_char = text_utf[i];
@@ -8806,40 +9830,40 @@ private:
             }

             if (!split_condition && !collecting) {
-                if (
+                if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
                     collecting_letter = true;
                     collecting = true;
                 }
-                else if (
+                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
                     collecting_numeric = true;
                     collecting = true;
                 }
                 else if (
-                    ((
-                    (!token.size() && utf_char == " " &&
+                    ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
+                    (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
                     ) {
                     collecting_special = true;
                     collecting = true;
                 }
-                else if (
+                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
                     collecting_whitespace_lookahead = true;
                     collecting = true;
                 }
-                else if (
+                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
                     split_condition = true;
                 }
             }
             else if (!split_condition && collecting) {
-                if (collecting_letter &&
+                if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
                     split_condition = true;
                 }
-                else if (collecting_numeric &&
+                else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
                     split_condition = true;
                 }
-                else if (collecting_special && (
+                else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
                     split_condition = true;
                 }
-                else if (collecting_whitespace_lookahead && (
+                else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
                     split_condition = true;
                 }
             }
@@ -8868,7 +9892,7 @@ private:
         for (std::string & word : bpe_words) {
             std::string encoded_token = "";
             for (char & c : word) {
-                encoded_token +=
+                encoded_token += unicode_byte_to_utf8(c);
             }
             bpe_encoded_words.emplace_back(encoded_token);
         }
@@ -8942,25 +9966,13 @@ struct llm_tokenizer_wpm {
     }

     std::vector<std::string> preprocess(const std::string & text) {
-
-        std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
-        std::vector<uint32_t> nfd_codepoints;
-        for (uint32_t code : codepoints) {
-            auto it = nfd_map.equal_range(code);
-            if (it.first != it.second) {
-                for (auto jt = it.first; jt != it.second; jt++) {
-                    nfd_codepoints.push_back(jt->second);
-                }
-            } else {
-                nfd_codepoints.push_back(code);
-            }
-        }
+        std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));

         // strip accents, strip control, uniformize whitespace,
         // to lowercase, pad chinese characters, pad punctuation
         std::string new_str = "";
-        for (uint32_t code :
-            int type =
+        for (uint32_t code : cpts_nfd) {
+            int type = unicode_cpt_type(code);
             if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
                 continue;
             }
@@ -8968,7 +9980,7 @@ struct llm_tokenizer_wpm {
             if (type == CODEPOINT_TYPE_WHITESPACE) {
                 code = ' ';
             }
-            std::string s =
+            std::string s = unicode_cpt_to_utf8(code);
             if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
                 new_str += " ";
                 new_str += s;
@@ -8988,8 +10000,7 @@ struct llm_tokenizer_wpm {
                 if (r > l) words.push_back(new_str.substr(l, (r - l)));
                 l = r + 1;
                 r = l;
-            }
-            else {
+            } else {
                 r += 1;
             }
         }
@@ -9013,17 +10024,17 @@ struct llm_tokenizer_wpm {
         return code < 256 && ispunct(code);
     }

-    bool is_chinese_char(uint32_t
-        if ((
-            (
-            (
-            (
-            (
-            (
-            (
-            (
-            (
-            (
+    bool is_chinese_char(uint32_t cpt) {
+        if ((cpt >= 0x4E00  && cpt <= 0x9FFF)  ||
+            (cpt >= 0x3400  && cpt <= 0x4DBF)  ||
+            (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
+            (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
+            (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
+            (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
+            (cpt >= 0xF900  && cpt <= 0xFAFF)  ||
+            (cpt >= 0x2F800 && cpt <= 0x2FA1F) ||
+            (cpt >= 0x3000  && cpt <= 0x303F)  ||
+            (cpt >= 0xFF00  && cpt <= 0xFFEF)) {
             return true; // NOLINT
         }
         return false;
@@ -9244,6 +10255,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     }
                 }
             } break;
+        case LLAMA_VOCAB_TYPE_NONE:
+            GGML_ASSERT(false);
     }

     return output;
@@ -9600,7 +10613,7 @@ struct llama_grammar * llama_grammar_init(

     // loop over alternates of start rule to build initial stacks
     std::vector<std::vector<const llama_grammar_element *>> stacks;
-    pos =
+    pos = vec_rules[start_rule_index].data();
     do {
         std::vector<const llama_grammar_element *> stack;
         if (!llama_grammar_is_end_of_sequence(pos)) {
@@ -10615,13 +11628,16 @@ struct quantize_state_internal {

     bool has_imatrix = false;

+    // used to figure out if a model shares tok_embd with the output weight
+    bool has_output = false;
+
     quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
         : model(model)
         , params(params)
         {}
 };

-static void
+static void llama_tensor_dequantize_internal(
     struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
 ) {
@@ -10682,7 +11698,7 @@ static void llama_convert_tensor_internal(
     workers.clear();
 }

-static ggml_type
+static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);

     // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -10712,8 +11728,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty

     // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
     // with the quantization of the output tensor
-    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
-        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
         int nx = tensor->ne[0];
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
@@ -10962,41 +11977,76 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     return new_type;
 }

+static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+    std::mutex mutex;
+    int counter = 0;
+    size_t new_size = 0;
+    if (nthread < 2) {
+        // single-thread
+        return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
+    }
+    auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
+            nrows, n_per_row, imatrix]() {
+        const int nrows_per_chunk = chunk_size / n_per_row;
+        size_t local_size = 0;
+        while (true) {
+            std::unique_lock<std::mutex> lock(mutex);
+            int first_row = counter; counter += nrows_per_chunk;
+            if (first_row >= nrows) {
+                if (local_size > 0) {
+                    new_size += local_size;
+                }
+                break;
+            }
+            lock.unlock();
+            const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
+            local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
+        }
+    };
+    for (int it = 0; it < nthread - 1; ++it) {
+        workers.emplace_back(compute);
+    }
+    compute();
+    for (auto & w : workers) { w.join(); }
+    workers.clear();
+    return new_size;
+}
+
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
-    ggml_type
+    ggml_type default_type;
     llama_ftype ftype = params->ftype;

     switch (params->ftype) {
-        case LLAMA_FTYPE_MOSTLY_Q4_0:
-        case LLAMA_FTYPE_MOSTLY_Q4_1:
-        case LLAMA_FTYPE_MOSTLY_Q5_0:
-        case LLAMA_FTYPE_MOSTLY_Q5_1:
-        case LLAMA_FTYPE_MOSTLY_Q8_0:
-        case LLAMA_FTYPE_MOSTLY_F16:
-        case LLAMA_FTYPE_ALL_F32:
+        case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
+        case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
+        case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;

         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q2_K:
-        case LLAMA_FTYPE_MOSTLY_IQ3_XS:
+        case LLAMA_FTYPE_MOSTLY_Q2_K:    default_type = GGML_TYPE_Q2_K;    break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS:  default_type = GGML_TYPE_IQ3_S;   break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
-        case LLAMA_FTYPE_MOSTLY_Q3_K_L:
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L:  default_type = GGML_TYPE_Q3_K;    break;
         case LLAMA_FTYPE_MOSTLY_Q4_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M:
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M:  default_type = GGML_TYPE_Q4_K;    break;
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q5_K_M:
-        case LLAMA_FTYPE_MOSTLY_Q6_K:
-        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:
-        case LLAMA_FTYPE_MOSTLY_IQ2_XS:
-        case LLAMA_FTYPE_MOSTLY_IQ2_S:
-        case LLAMA_FTYPE_MOSTLY_IQ2_M:
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:
-        case LLAMA_FTYPE_MOSTLY_IQ1_S:
-        case LLAMA_FTYPE_MOSTLY_IQ4_NL:
-        case LLAMA_FTYPE_MOSTLY_IQ4_XS:
-        case LLAMA_FTYPE_MOSTLY_IQ3_S:
-        case LLAMA_FTYPE_MOSTLY_IQ3_M:
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  default_type = GGML_TYPE_Q5_K;    break;
+        case LLAMA_FTYPE_MOSTLY_Q6_K:    default_type = GGML_TYPE_Q6_K;    break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  default_type = GGML_TYPE_IQ2_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_S:   default_type = GGML_TYPE_IQ2_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_M:   default_type = GGML_TYPE_IQ2_S;   break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ1_S:   default_type = GGML_TYPE_IQ1_S;   break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL:  default_type = GGML_TYPE_IQ4_NL;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS:  default_type = GGML_TYPE_IQ4_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_S:   default_type = GGML_TYPE_IQ3_S;   break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:   default_type = GGML_TYPE_IQ3_S;   break;

         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
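Worked example for the chunking in the new llama_tensor_quantize_internal (all sizes hypothetical): each worker claims chunk_size/n_per_row rows per lock acquisition, so with chunk_size = 16384 elements and n_per_row = 4096 it claims 4 rows at a time, and a 4096-row tensor is divided into 1024 such claims shared among the nthread workers; the per-worker byte counts are accumulated into new_size under the mutex when each worker finishes.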
@@ -11062,6 +12112,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         else if (name.find("ffn_up") != std::string::npos) {
             ++qs.n_ffn_up;
         }
+        else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
+            qs.has_output = true;
+        }
     }
     if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
         LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
@@ -11070,11 +12123,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

     size_t total_size_org = 0;
     size_t total_size_new = 0;
-    std::vector<int64_t> hist_all(1 << 4, 0);

     std::vector<std::thread> workers;
     workers.reserve(nthread);
-    std::mutex mutex;

     int idx = 0;

@@ -11133,20 +12184,29 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");

+        // do not quantize Mamba's small yet 2D weights
+        // NOTE: can't use LLM_TN here because the layer number is not known
+        quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
+        quantize &= name.find("ssm_x.weight") == std::string::npos;
+        quantize &= name.find("ssm_dt.weight") == std::string::npos;
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;

         if (quantize) {
-            new_type =
-
-
+            new_type = default_type;
+
+            // get more optimal quantization type based on the tensor shape, layer, etc.
+            if (!params->pure && ggml_is_quantized(default_type)) {
+                new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
             }

             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
             quantize = tensor->type != new_type;
         }
+
         if (!quantize) {
             new_type = tensor->type;
             new_data = tensor->data;
@@ -11188,18 +12248,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
             throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
         } else {
-
+            llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
             f32_data = (float *) f32_conv_buf.data();
         }

-        LLAMA_LOG_INFO("
+        LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
         fflush(stdout);

         if (work.size() < nelements * 4) {
             work.resize(nelements * 4); // upper bound on size
         }
         new_data = work.data();
-        std::array<int64_t, 1 << 4> hist_cur = {};

         const int n_per_row = tensor->ne[0];
         const int nrows = nelements / n_per_row;
@@ -11209,56 +12268,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

         const int nchunk = (nelements + chunk_size - 1)/chunk_size;
         const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
-
-            new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
-        } else {
-            int counter = 0;
-            new_size = 0;
-            auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
-                            nrows, n_per_row, imatrix]() {
-                std::array<int64_t, 1 << 4> local_hist = {};
-                const int nrows_per_chunk = chunk_size / n_per_row;
-                size_t local_size = 0;
-                while (true) {
-                    std::unique_lock<std::mutex> lock(mutex);
-                    int first_row = counter; counter += nrows_per_chunk;
-                    if (first_row >= nrows) {
-                        if (local_size > 0) {
-                            for (int j=0; j<int(local_hist.size()); ++j) {
-                                hist_cur[j] += local_hist[j];
-                            }
-                            new_size += local_size;
-                        }
-                        break;
-                    }
-                    lock.unlock();
-                    const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
-                    local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
-                            first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
-                }
-            };
-            for (int it = 0; it < nthread_use - 1; ++it) {
-                workers.emplace_back(compute);
-            }
-            compute();
-            for (auto & w : workers) { w.join(); }
-            workers.clear();
-        }
-
-        LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
-        int64_t tot_count = 0;
-        for (size_t i = 0; i < hist_cur.size(); i++) {
-            hist_all[i] += hist_cur[i];
-            tot_count += hist_cur[i];
-        }
+        new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix, workers, nthread_use);

-
-            LLAMA_LOG_INFO(" | hist: ");
-            for (size_t i = 0; i < hist_cur.size(); i++) {
-                LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
-            }
-        }
-        LLAMA_LOG_INFO("\n");
+        LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
         }
         total_size_org += ggml_nbytes(tensor);
         total_size_new += new_size;
@@ -11287,24 +12299,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
     LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);

-    // print histogram for all tensors
-    {
-        int64_t sum_all = 0;
-        for (size_t i = 0; i < hist_all.size(); i++) {
-            sum_all += hist_all[i];
-        }
-
-        if (sum_all > 0) {
-            LLAMA_LOG_INFO("%s: hist: ", __func__);
-            for (size_t i = 0; i < hist_all.size(); i++) {
-                LLAMA_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all));
-            }
-            LLAMA_LOG_INFO("\n");
-        }
-    }
-
     if (qs.n_fallback > 0) {
-        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s)
+        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
             __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
     }
 }
@@ -11616,10 +12612,13 @@ struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.seed                =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx               =*/ 512,
-        /*.n_batch             =*/
+        /*.n_batch             =*/ 2048,
+        /*.n_ubatch            =*/ 512,
+        /*.n_seq_max           =*/ 1,
        /*.n_threads            =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
        /*.n_threads_batch      =*/ GGML_DEFAULT_N_THREADS,
        /*.rope_scaling_type    =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
+       /*.pooling_type         =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
        /*.rope_freq_base       =*/ 0.0f,
        /*.rope_freq_scale      =*/ 0.0f,
        /*.yarn_ext_factor      =*/ -1.0f,
@@ -11633,9 +12632,10 @@ struct llama_context_params llama_context_default_params() {
        /*.type_k               =*/ GGML_TYPE_F16,
        /*.type_v               =*/ GGML_TYPE_F16,
        /*.logits_all           =*/ false,
-       /*.
+       /*.embeddings           =*/ false,
        /*.offload_kqv          =*/ true,
-       /*.
+       /*.abort_callback       =*/ nullptr,
+       /*.abort_callback_data  =*/ nullptr,
     };

     return result;
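Note: the defaults above introduce n_ubatch, n_seq_max, pooling_type, embeddings and the abort callback pair into llama_context_params. The sketch below (illustrative only, not from the diff; the model file name is a placeholder and error handling is omitted) shows how a caller of the bundled C API might override them; bindings such as the Ruby gem expose the same fields through their own wrappers.

    // illustrative sketch of overriding the new context parameters
    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("model.gguf", mparams);

        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx        = 4096;
        cparams.n_batch      = 2048; // logical batch accepted by llama_decode
        cparams.n_ubatch     = 512;  // physical micro-batch computed at once
        cparams.n_seq_max    = 1;
        cparams.embeddings   = false;
        cparams.pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED;

        llama_context * ctx = llama_new_context_with_model(model, cparams);

        // ... run inference with ctx ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }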
@@ -11767,6 +12767,17 @@ struct llama_context * llama_new_context_with_model(
                  struct llama_context_params   params) {

     if (!model) {
+        LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
+        return nullptr;
+    }
+
+    if (params.n_batch == 0 && params.n_ubatch == 0) {
+        LLAMA_LOG_ERROR("%s: n_batch and n_ubatch cannot both be zero\n", __func__);
+        return nullptr;
+    }
+
+    if (params.n_ctx == 0 && model->hparams.n_ctx_train == 0) {
+        LLAMA_LOG_ERROR("%s: n_ctx and model->hparams.n_ctx_train cannot both be zero\n", __func__);
         return nullptr;
     }

@@ -11775,7 +12786,7 @@ struct llama_context * llama_new_context_with_model(
     const auto & hparams = model->hparams;
     auto       & cparams = ctx->cparams;

-
+    // TODO: maybe add n_seq_max here too
     cparams.n_threads       = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch;
     cparams.yarn_ext_factor = params.yarn_ext_factor;
@@ -11783,13 +12794,19 @@ struct llama_context * llama_new_context_with_model(
     cparams.yarn_beta_fast  = params.yarn_beta_fast;
     cparams.yarn_beta_slow  = params.yarn_beta_slow;
     cparams.defrag_thold    = params.defrag_thold;
+    cparams.embeddings      = params.embeddings;
     cparams.offload_kqv     = params.offload_kqv;
-    cparams.
+    cparams.pooling_type    = params.pooling_type;

     cparams.n_ctx           = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
     cparams.rope_freq_base  = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
     cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;

+    // with causal attention, the batch size is limited by the context size
+    cparams.n_batch         = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
+    cparams.n_ubatch        = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
+
+
     cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
                               hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
                                                              hparams.n_ctx_train;
@@ -11810,19 +12827,44 @@ struct llama_context * llama_new_context_with_model(
         cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
     }

+    cparams.causal_attn = hparams.causal_attn;
+
+    if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
+        if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
+            cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
+        } else {
+            cparams.pooling_type = hparams.pooling_type;
+        }
+    }
+
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }

     LLAMA_LOG_INFO("%s: n_ctx      = %u\n", __func__, cparams.n_ctx);
+    LLAMA_LOG_INFO("%s: n_batch    = %u\n", __func__, cparams.n_batch);
+    LLAMA_LOG_INFO("%s: n_ubatch   = %u\n", __func__, cparams.n_ubatch);
     LLAMA_LOG_INFO("%s: freq_base  = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);

-    ctx->
-    ctx->
+    ctx->abort_callback      = params.abort_callback;
+    ctx->abort_callback_data = params.abort_callback_data;
+
+    ctx->rng        = std::mt19937(params.seed);
+    ctx->logits_all = params.logits_all;

-
-
+    uint32_t kv_size = cparams.n_ctx;
+    ggml_type type_k = params.type_k;
+    ggml_type type_v = params.type_v;
+
+    // Mamba only needs a constant number of KV cache cells per sequence
+    if (model->arch == LLM_ARCH_MAMBA) {
+        // Mamba needs at least as many KV cells as there are sequences kept at any time
+        kv_size = std::max((uint32_t) 1, params.n_seq_max);
+        // it's probably best to keep as much precision as possible for the states
+        type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states
+        type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states
+    }

     GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
     GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
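Worked example for the Mamba branch above: kv_size = max(1, n_seq_max), so with the default n_seq_max = 1 the "KV" cache holds a single state cell instead of n_ctx cells, and both cache halves are forced to F32 because ggml_ssm_conv and ggml_ssm_scan operate on full-precision conv and ssm states.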
@@ -11877,13 +12919,31 @@ struct llama_context * llama_new_context_with_model(
|
|
11877
12919
|
}
|
11878
12920
|
#elif defined(GGML_USE_SYCL)
|
11879
12921
|
if (model->n_gpu_layers > 0) {
|
11880
|
-
|
11881
|
-
if (
|
11882
|
-
|
11883
|
-
|
11884
|
-
|
12922
|
+
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
12923
|
+
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
12924
|
+
int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
|
12925
|
+
ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
|
12926
|
+
if (backend == nullptr) {
|
12927
|
+
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
|
12928
|
+
llama_free(ctx);
|
12929
|
+
return nullptr;
|
12930
|
+
}
|
12931
|
+
ctx->backends.push_back(backend);
|
12932
|
+
} else {
|
12933
|
+
// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
|
12934
|
+
int id_list[GGML_SYCL_MAX_DEVICES];
|
12935
|
+
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
|
12936
|
+
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
|
12937
|
+
int device_id = id_list[i];
|
12938
|
+
ggml_backend_t backend = ggml_backend_sycl_init(i);
|
12939
|
+
if (backend == nullptr) {
|
12940
|
+
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
|
12941
|
+
llama_free(ctx);
|
12942
|
+
return nullptr;
|
12943
|
+
}
|
12944
|
+
ctx->backends.push_back(backend);
|
12945
|
+
}
|
11885
12946
|
}
|
11886
|
-
ctx->backends.push_back(backend);
|
11887
12947
|
}
|
11888
12948
|
#elif defined(GGML_USE_KOMPUTE)
|
11889
12949
|
if (model->n_gpu_layers > 0) {
|
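On the SYCL path, LLAMA_SPLIT_MODE_NONE and LLAMA_SPLIT_MODE_ROW now create only the main GPU backend, while LLAMA_SPLIT_MODE_LAYER creates one backend per device. A sketch of selecting either behaviour from the application side; the layer count and device index are illustrative:

    #include "llama.h"

    // sketch: single-GPU loading, only main_gpu gets a backend
    llama_model * load_single_gpu(const char * path) {
        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 99;                    // offload all layers
        mparams.split_mode   = LLAMA_SPLIT_MODE_NONE; // only the main GPU is used
        mparams.main_gpu     = 0;
        return llama_load_model_from_file(path, mparams);
    }

    // sketch: layer split, one backend per detected device
    llama_model * load_layer_split(const char * path) {
        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 99;
        mparams.split_mode   = LLAMA_SPLIT_MODE_LAYER;
        return llama_load_model_from_file(path, mparams);
    }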
@@ -11904,7 +12964,7 @@ struct llama_context * llama_new_context_with_model(
|
|
11904
12964
|
}
|
11905
12965
|
ctx->backends.push_back(ctx->backend_cpu);
|
11906
12966
|
|
11907
|
-
if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v,
|
12967
|
+
if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) {
|
11908
12968
|
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
11909
12969
|
llama_free(ctx);
|
11910
12970
|
return nullptr;
|
@@ -11928,45 +12988,31 @@ struct llama_context * llama_new_context_with_model(
|
|
11928
12988
|
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
|
11929
12989
|
}
|
11930
12990
|
|
11931
|
-
//
|
11932
|
-
ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
|
11933
|
-
|
11934
|
-
if (params.embedding) {
|
11935
|
-
ctx->embedding.resize(hparams.n_embd);
|
11936
|
-
}
|
11937
|
-
|
11938
|
-
// graph inputs
|
12991
|
+
// graph outputs buffer
|
11939
12992
|
{
|
11940
|
-
|
11941
|
-
|
11942
|
-
|
11943
|
-
/* .no_alloc */ true,
|
11944
|
-
};
|
11945
|
-
ctx->ctx_input = ggml_init(init_params);
|
12993
|
+
// resized during inference, reserve maximum
|
12994
|
+
ctx->logits_size = hparams.n_vocab*cparams.n_batch;
|
12995
|
+
ctx->embd_size = params.embeddings ? hparams.n_embd*cparams.n_batch : 0;
|
11946
12996
|
|
11947
|
-
|
11948
|
-
|
11949
|
-
ctx->
|
11950
|
-
|
11951
|
-
|
11952
|
-
|
11953
|
-
|
11954
|
-
|
12997
|
+
const size_t buf_output_size = (ctx->logits_size + ctx->embd_size)*sizeof(float);
|
12998
|
+
|
12999
|
+
ctx->buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_output_size);
|
13000
|
+
if (ctx->buf_output == nullptr) {
|
13001
|
+
LLAMA_LOG_ERROR("%s: failed to allocate logits buffer\n", __func__);
|
13002
|
+
llama_free(ctx);
|
13003
|
+
return nullptr;
|
13004
|
+
}
|
13005
|
+
ggml_backend_buffer_clear(ctx->buf_output, 0);
|
11955
13006
|
|
11956
|
-
ggml_set_name(ctx->inp_tokens, "inp_tokens");
|
11957
|
-
ggml_set_name(ctx->inp_embd, "inp_embd");
|
11958
|
-
ggml_set_name(ctx->inp_pos, "inp_pos");
|
11959
|
-
ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
|
11960
|
-
ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos");
|
11961
|
-
ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
|
11962
|
-
ggml_set_name(ctx->inp_mean, "inp_mean");
|
11963
|
-
ggml_set_name(ctx->inp_cls, "inp_cls");
|
11964
13007
|
|
11965
|
-
ctx->
|
13008
|
+
ctx->logits = (float *) ggml_backend_buffer_get_base(ctx->buf_output);
|
13009
|
+
if (params.embeddings) {
|
13010
|
+
ctx->embd = ctx->logits + ctx->logits_size;
|
13011
|
+
}
|
11966
13012
|
|
11967
|
-
LLAMA_LOG_INFO("%s: %10s
|
11968
|
-
ggml_backend_buffer_name(ctx->
|
11969
|
-
ggml_backend_buffer_get_size(ctx->
|
13013
|
+
LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
|
13014
|
+
ggml_backend_buffer_name(ctx->buf_output),
|
13015
|
+
ggml_backend_buffer_get_size(ctx->buf_output) / 1024.0 / 1024.0);
|
11970
13016
|
}
|
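The logits and the optional embedding outputs are now carved out of a single backend buffer sized for the worst case. The arithmetic below reproduces that sizing for example dimensions (n_vocab, n_batch and n_embd are illustrative values, not read from a real model):

    #include <cstddef>
    #include <cstdio>

    // sketch: worst-case size of the shared output buffer
    int main() {
        const size_t n_vocab = 32000, n_batch = 512, n_embd = 4096;
        const size_t logits_size = n_vocab * n_batch; // always reserved
        const size_t embd_size   = n_embd  * n_batch; // only when embeddings are enabled
        const size_t bytes = (logits_size + embd_size) * sizeof(float);
        printf("output buffer: %.2f MiB\n", bytes / 1024.0 / 1024.0); // ~70.50 MiB
        return 0;
    }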
11971
13017
|
|
11972
13018
|
// scheduler and compute buffers
|
@@ -11985,10 +13031,21 @@ struct llama_context * llama_new_context_with_model(
|
|
11985
13031
|
// buffer used to store the computation graph and the tensor meta data
|
11986
13032
|
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
|
11987
13033
|
|
11988
|
-
|
13034
|
+
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
|
13035
|
+
bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
|
13036
|
+
#ifndef GGML_USE_CUBLAS
|
13037
|
+
// pipeline parallelism requires support for async compute and events
|
13038
|
+
// currently this is only implemented in the CUDA backend
|
13039
|
+
pipeline_parallel = false;
|
13040
|
+
#endif
|
13041
|
+
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel);
|
13042
|
+
|
13043
|
+
if (pipeline_parallel) {
|
13044
|
+
LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
|
13045
|
+
}
|
11989
13046
|
|
11990
13047
|
// build worst-case graph
|
11991
|
-
int n_tokens = (int)std::min(cparams.n_ctx, cparams.
|
13048
|
+
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_ubatch);
|
11992
13049
|
int n_past = cparams.n_ctx - n_tokens;
|
11993
13050
|
llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
11994
13051
|
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
|
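Per the condition above, pipeline parallelism only engages with more than one device, every layer offloaded, layer split mode, and a CUDA build. A sketch of a configuration that would satisfy it, assuming a multi-GPU CUDA build (the values are illustrative):

    #include "llama.h"

    // sketch: a setup under which the scheduler above enables pipeline parallelism
    llama_context * make_pipelined_ctx(const char * path) {
        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 999;                    // more than the model has layers -> full offload
        mparams.split_mode   = LLAMA_SPLIT_MODE_LAYER; // required for pipeline parallelism

        llama_model * model = llama_load_model_from_file(path, mparams);

        llama_context_params cparams = llama_context_default_params();
        cparams.n_batch  = 4096; // large logical batch ...
        cparams.n_ubatch = 512;  // ... split into micro-batches that can overlap across GPUs
        return llama_new_context_with_model(model, cparams);
    }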
@@ -12011,7 +13068,7 @@ struct llama_context * llama_new_context_with_model(
|
|
12011
13068
|
|
12012
13069
|
// note: the number of splits during measure is higher than during inference due to the kv shift
|
12013
13070
|
int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
|
12014
|
-
LLAMA_LOG_INFO("%s: graph splits
|
13071
|
+
LLAMA_LOG_INFO("%s: graph splits: %d\n", __func__, n_splits);
|
12015
13072
|
}
|
12016
13073
|
}
|
12017
13074
|
|
@@ -12048,6 +13105,14 @@ uint32_t llama_n_batch(const struct llama_context * ctx) {
|
|
12048
13105
|
return ctx->cparams.n_batch;
|
12049
13106
|
}
|
12050
13107
|
|
13108
|
+
uint32_t llama_n_ubatch(const struct llama_context * ctx) {
|
13109
|
+
return ctx->cparams.n_ubatch;
|
13110
|
+
}
|
13111
|
+
|
13112
|
+
uint32_t llama_n_seq_max(const struct llama_context * ctx) {
|
13113
|
+
return ctx->kv_self.size;
|
13114
|
+
}
|
13115
|
+
|
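The two new getters sit next to the existing ones, so the effective limits can be read back from a live context. A short sketch (the printing is illustrative):

    #include "llama.h"
    #include <cstdio>

    // sketch: query the effective limits after context creation
    void print_ctx_limits(const llama_context * ctx) {
        printf("n_ctx     = %u\n", llama_n_ctx(ctx));
        printf("n_batch   = %u\n", llama_n_batch(ctx));
        printf("n_ubatch  = %u\n", llama_n_ubatch(ctx));  // new in this release
        printf("n_seq_max = %u\n", llama_n_seq_max(ctx)); // new in this release
    }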
12051
13116
|
enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
|
12052
13117
|
return model->vocab.type;
|
12053
13118
|
}
|
@@ -12061,6 +13126,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
12061
13126
|
case LLM_ARCH_MPT:
|
12062
13127
|
case LLM_ARCH_REFACT:
|
12063
13128
|
case LLM_ARCH_BLOOM:
|
13129
|
+
case LLM_ARCH_MAMBA:
|
12064
13130
|
return LLAMA_ROPE_TYPE_NONE;
|
12065
13131
|
|
12066
13132
|
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
@@ -12084,6 +13150,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
12084
13150
|
case LLM_ARCH_QWEN2:
|
12085
13151
|
case LLM_ARCH_PHI2:
|
12086
13152
|
case LLM_ARCH_GEMMA:
|
13153
|
+
case LLM_ARCH_STARCODER2:
|
12087
13154
|
return LLAMA_ROPE_TYPE_NEOX;
|
12088
13155
|
|
12089
13156
|
// all model arches should be listed explicitly here
|
@@ -12096,7 +13163,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
12096
13163
|
}
|
12097
13164
|
|
12098
13165
|
int32_t llama_n_vocab(const struct llama_model * model) {
|
12099
|
-
return model->
|
13166
|
+
return model->hparams.n_vocab;
|
12100
13167
|
}
|
12101
13168
|
|
12102
13169
|
int32_t llama_n_ctx_train(const struct llama_model * model) {
|
@@ -12206,10 +13273,10 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
|
|
12206
13273
|
}
|
12207
13274
|
}
|
12208
13275
|
|
12209
|
-
struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t
|
13276
|
+
struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
|
12210
13277
|
struct llama_kv_cache_view result = {
|
12211
13278
|
/*.n_cells = */ 0,
|
12212
|
-
/*.
|
13279
|
+
/*.n_seq_max = */ n_seq_max,
|
12213
13280
|
/*.token_count = */ 0,
|
12214
13281
|
/*.used_cells = */ llama_get_kv_cache_used_cells(ctx),
|
12215
13282
|
/*.max_contiguous = */ 0,
|
@@ -12237,7 +13304,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
|
|
12237
13304
|
void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
|
12238
13305
|
GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
|
12239
13306
|
view->cells = (struct llama_kv_cache_view_cell *)p;
|
12240
|
-
p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->
|
13307
|
+
p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
|
12241
13308
|
GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
|
12242
13309
|
view->cells_sequences = (llama_seq_id *)p;
|
12243
13310
|
}
|
@@ -12251,7 +13318,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
|
|
12251
13318
|
uint32_t max_contig = 0;
|
12252
13319
|
int32_t max_contig_idx = -1;
|
12253
13320
|
|
12254
|
-
for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->
|
13321
|
+
for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_seq_max) {
|
12255
13322
|
const size_t curr_size = kv_cells[i].seq_id.size();
|
12256
13323
|
token_count += curr_size;
|
12257
13324
|
c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
|
@@ -12268,7 +13335,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
|
|
12268
13335
|
|
12269
13336
|
int seq_idx = 0;
|
12270
13337
|
for (const llama_seq_id it : kv_cells[i].seq_id) {
|
12271
|
-
if (seq_idx >= view->
|
13338
|
+
if (seq_idx >= view->n_seq_max) {
|
12272
13339
|
break;
|
12273
13340
|
}
|
12274
13341
|
cs_curr[seq_idx] = it;
|
@@ -12277,7 +13344,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
|
|
12277
13344
|
if (seq_idx != 0) {
|
12278
13345
|
used_cells++;
|
12279
13346
|
}
|
12280
|
-
for (; seq_idx < view->
|
13347
|
+
for (; seq_idx < view->n_seq_max; seq_idx++) {
|
12281
13348
|
cs_curr[seq_idx] = -1;
|
12282
13349
|
}
|
12283
13350
|
}
|
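With the field renamed to n_seq_max, inspecting KV cache occupancy looks like the sketch below; the sequence limit passed to the view is illustrative:

    #include "llama.h"
    #include <cstdio>

    // sketch: build, update and free a KV cache view
    void dump_kv_usage(const llama_context * ctx) {
        llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_seq_max=*/4);
        llama_kv_cache_view_update(ctx, &view);
        printf("cells used: %d / %d, tokens: %d\n",
               view.used_cells, view.n_cells, view.token_count);
        llama_kv_cache_view_free(&view);
    }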
@@ -12313,8 +13380,8 @@ void llama_kv_cache_clear(struct llama_context * ctx) {
|
|
12313
13380
|
llama_kv_cache_clear(ctx->kv_self);
|
12314
13381
|
}
|
12315
13382
|
|
12316
|
-
|
12317
|
-
llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
|
13383
|
+
bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
|
13384
|
+
return llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
|
12318
13385
|
}
|
12319
13386
|
|
12320
13387
|
void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
|
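llama_kv_cache_seq_rm now reports whether the removal actually happened, which lets callers detect caches that cannot drop a partial range (for example the recurrent Mamba state). A hedged sketch; falling back to dropping the whole sequence is one possible strategy, not something the API mandates:

    #include "llama.h"

    // sketch: honour the new bool return of llama_kv_cache_seq_rm
    void trim_sequence(llama_context * ctx, llama_seq_id seq, llama_pos keep_until) {
        // try to drop everything from position keep_until onward
        if (!llama_kv_cache_seq_rm(ctx, seq, keep_until, -1)) {
            // partial removal not supported for this cache: drop the whole sequence
            llama_kv_cache_seq_rm(ctx, seq, -1, -1);
        }
    }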
@@ -12365,12 +13432,17 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
|
|
12365
13432
|
const size_t s_rng = LLAMA_MAX_RNG_STATE;
|
12366
13433
|
const size_t s_logits_size = sizeof(size_t);
|
12367
13434
|
// assume worst case for logits although only currently set ones are serialized
|
12368
|
-
const size_t s_logits = ctx->
|
13435
|
+
const size_t s_logits = ctx->logits_size * sizeof(float);
|
12369
13436
|
const size_t s_embedding_size = sizeof(size_t);
|
12370
|
-
const size_t s_embedding = ctx->
|
12371
|
-
const size_t
|
12372
|
-
const size_t
|
13437
|
+
const size_t s_embedding = ctx->embd_size * sizeof(float);
|
13438
|
+
const size_t s_kv_buf_size = sizeof(size_t);
|
13439
|
+
const size_t s_kv_head = sizeof(uint32_t);
|
13440
|
+
const size_t s_kv_size = sizeof(uint32_t);
|
13441
|
+
const size_t s_kv_used = sizeof(uint32_t);
|
12373
13442
|
const size_t s_kv = ctx->kv_self.total_size();
|
13443
|
+
// TODO: assume the max is more than 1 seq_id per KV cell
|
13444
|
+
const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
|
13445
|
+
const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
|
12374
13446
|
|
12375
13447
|
const size_t s_total = (
|
12376
13448
|
+ s_rng_size
|
@@ -12379,9 +13451,12 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
|
|
12379
13451
|
+ s_logits
|
12380
13452
|
+ s_embedding_size
|
12381
13453
|
+ s_embedding
|
13454
|
+
+ s_kv_buf_size
|
13455
|
+
+ s_kv_head
|
12382
13456
|
+ s_kv_size
|
12383
|
-
+
|
13457
|
+
+ s_kv_used
|
12384
13458
|
+ s_kv
|
13459
|
+
+ s_kv_cells
|
12385
13460
|
);
|
12386
13461
|
|
12387
13462
|
return s_total;
|
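The reported state size now also budgets the KV cell metadata counted above, but the calling pattern is unchanged: treat llama_get_state_size as an upper bound and keep what llama_copy_state_data actually wrote. A sketch:

    #include "llama.h"
    #include <vector>

    // sketch: snapshot and restore a context through the state API
    std::vector<uint8_t> snapshot_state(llama_context * ctx) {
        std::vector<uint8_t> buf(llama_get_state_size(ctx)); // upper bound
        const size_t written = llama_copy_state_data(ctx, buf.data());
        buf.resize(written);
        return buf;
    }

    void restore_state(llama_context * ctx, const std::vector<uint8_t> & buf) {
        llama_set_state_data(ctx, buf.data());
    }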
@@ -12457,23 +13532,23 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
12457
13532
|
|
12458
13533
|
// copy logits
|
12459
13534
|
{
|
12460
|
-
const size_t logits_size = ctx->
|
13535
|
+
const size_t logits_size = ctx->logits_size;
|
12461
13536
|
|
12462
13537
|
data_ctx->write(&logits_size, sizeof(logits_size));
|
12463
13538
|
|
12464
13539
|
if (logits_size) {
|
12465
|
-
data_ctx->write(ctx->logits
|
13540
|
+
data_ctx->write(ctx->logits, logits_size * sizeof(float));
|
12466
13541
|
}
|
12467
13542
|
}
|
12468
13543
|
|
12469
13544
|
// copy embeddings
|
12470
13545
|
{
|
12471
|
-
const size_t
|
13546
|
+
const size_t embeddings_size = ctx->embd_size;
|
12472
13547
|
|
12473
|
-
data_ctx->write(&
|
13548
|
+
data_ctx->write(&embeddings_size, sizeof(embeddings_size));
|
12474
13549
|
|
12475
|
-
if (
|
12476
|
-
data_ctx->write(ctx->
|
13550
|
+
if (embeddings_size) {
|
13551
|
+
data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
|
12477
13552
|
}
|
12478
13553
|
}
|
12479
13554
|
|
@@ -12481,15 +13556,13 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
12481
13556
|
{
|
12482
13557
|
const auto & kv_self = ctx->kv_self;
|
12483
13558
|
const auto & hparams = ctx->model.hparams;
|
12484
|
-
const auto & cparams = ctx->cparams;
|
12485
13559
|
|
12486
13560
|
const uint32_t n_layer = hparams.n_layer;
|
12487
|
-
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
12488
|
-
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
12489
|
-
const uint32_t n_ctx = cparams.n_ctx;
|
13561
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
|
13562
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
|
12490
13563
|
|
12491
13564
|
const size_t kv_buf_size = kv_self.total_size();
|
12492
|
-
const uint32_t kv_head = kv_self
|
13565
|
+
const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
|
12493
13566
|
const uint32_t kv_size = kv_self.size;
|
12494
13567
|
const uint32_t kv_used = kv_self.used;
|
12495
13568
|
|
@@ -12507,9 +13580,20 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
12507
13580
|
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
|
12508
13581
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
12509
13582
|
|
13583
|
+
if (kv_self.recurrent) {
|
13584
|
+
// v is contiguous for recurrent models
|
13585
|
+
// TODO: use other tensors for state models than k and v
|
13586
|
+
const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
|
13587
|
+
|
13588
|
+
tmp_buf.resize(v_size);
|
13589
|
+
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), 0, tmp_buf.size());
|
13590
|
+
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
13591
|
+
continue;
|
13592
|
+
}
|
13593
|
+
|
12510
13594
|
// v is not contiguous, copy row by row
|
12511
13595
|
const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
|
12512
|
-
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type,
|
13596
|
+
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
|
12513
13597
|
|
12514
13598
|
tmp_buf.resize(v_row_size);
|
12515
13599
|
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
@@ -12519,7 +13603,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
12519
13603
|
}
|
12520
13604
|
}
|
12521
13605
|
|
12522
|
-
for (uint32_t i = 0; i <
|
13606
|
+
for (uint32_t i = 0; i < kv_head; ++i) {
|
12523
13607
|
const auto & cell = kv_self.cells[i];
|
12524
13608
|
|
12525
13609
|
const llama_pos pos = cell.pos;
|
@@ -12567,27 +13651,25 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
12567
13651
|
|
12568
13652
|
memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
|
12569
13653
|
|
12570
|
-
GGML_ASSERT(ctx->
|
13654
|
+
GGML_ASSERT(ctx->logits_size >= logits_size);
|
12571
13655
|
|
12572
13656
|
if (logits_size) {
|
12573
|
-
ctx->logits
|
12574
|
-
|
12575
|
-
memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
|
13657
|
+
memcpy(ctx->logits, inp, logits_size * sizeof(float));
|
12576
13658
|
inp += logits_size * sizeof(float);
|
12577
13659
|
}
|
12578
13660
|
}
|
12579
13661
|
|
12580
13662
|
// set embeddings
|
12581
13663
|
{
|
12582
|
-
size_t
|
13664
|
+
size_t embeddings_size;
|
12583
13665
|
|
12584
|
-
memcpy(&
|
13666
|
+
memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
|
12585
13667
|
|
12586
|
-
GGML_ASSERT(ctx->
|
13668
|
+
GGML_ASSERT(ctx->embd_size == embeddings_size);
|
12587
13669
|
|
12588
|
-
if (
|
12589
|
-
memcpy(ctx->
|
12590
|
-
inp +=
|
13670
|
+
if (embeddings_size) {
|
13671
|
+
memcpy(ctx->embd, inp, embeddings_size * sizeof(float));
|
13672
|
+
inp += embeddings_size * sizeof(float);
|
12591
13673
|
}
|
12592
13674
|
}
|
12593
13675
|
|
@@ -12595,12 +13677,10 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
12595
13677
|
{
|
12596
13678
|
const auto & kv_self = ctx->kv_self;
|
12597
13679
|
const auto & hparams = ctx->model.hparams;
|
12598
|
-
const auto & cparams = ctx->cparams;
|
12599
13680
|
|
12600
13681
|
const uint32_t n_layer = hparams.n_layer;
|
12601
|
-
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
12602
|
-
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
12603
|
-
const uint32_t n_ctx = cparams.n_ctx;
|
13682
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
|
13683
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
|
12604
13684
|
|
12605
13685
|
size_t kv_buf_size;
|
12606
13686
|
uint32_t kv_head;
|
@@ -12621,9 +13701,19 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
12621
13701
|
ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
|
12622
13702
|
inp += k_size;
|
12623
13703
|
|
13704
|
+
if (kv_self.recurrent) {
|
13705
|
+
// v is contiguous for recurrent models
|
13706
|
+
// TODO: use other tensors for state models than k and v
|
13707
|
+
const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
|
13708
|
+
|
13709
|
+
ggml_backend_tensor_set(kv_self.v_l[il], inp, 0, v_size);
|
13710
|
+
inp += v_size;
|
13711
|
+
continue;
|
13712
|
+
}
|
13713
|
+
|
12624
13714
|
// v is not contiguous, copy row by row
|
12625
13715
|
const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
|
12626
|
-
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type,
|
13716
|
+
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
|
12627
13717
|
|
12628
13718
|
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
12629
13719
|
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
|
@@ -12632,13 +13722,15 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
12632
13722
|
}
|
12633
13723
|
}
|
12634
13724
|
|
13725
|
+
GGML_ASSERT(kv_self.size == kv_size);
|
13726
|
+
|
12635
13727
|
ctx->kv_self.head = kv_head;
|
12636
13728
|
ctx->kv_self.size = kv_size;
|
12637
13729
|
ctx->kv_self.used = kv_used;
|
12638
13730
|
|
12639
13731
|
ctx->kv_self.cells.resize(kv_size);
|
12640
13732
|
|
12641
|
-
for (uint32_t i = 0; i <
|
13733
|
+
for (uint32_t i = 0; i < kv_head; ++i) {
|
12642
13734
|
llama_pos pos;
|
12643
13735
|
size_t seq_id_size;
|
12644
13736
|
|
@@ -12654,6 +13746,11 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
12654
13746
|
ctx->kv_self.cells[i].seq_id.insert(seq_id);
|
12655
13747
|
}
|
12656
13748
|
}
|
13749
|
+
|
13750
|
+
for (uint32_t i = kv_head; i < kv_size; ++i) {
|
13751
|
+
ctx->kv_self.cells[i].pos = -1;
|
13752
|
+
ctx->kv_self.cells[i].seq_id.clear();
|
13753
|
+
}
|
12657
13754
|
}
|
12658
13755
|
|
12659
13756
|
const size_t nread = inp - src;
|
@@ -12751,6 +13848,15 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
|
|
12751
13848
|
ctx->cparams.n_threads_batch = n_threads_batch;
|
12752
13849
|
}
|
12753
13850
|
|
13851
|
+
void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
|
13852
|
+
ctx->abort_callback = abort_callback;
|
13853
|
+
ctx->abort_callback_data = abort_callback_data;
|
13854
|
+
}
|
13855
|
+
|
13856
|
+
void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
|
13857
|
+
ctx->cparams.causal_attn = causal_attn;
|
13858
|
+
}
|
13859
|
+
|
12754
13860
|
struct llama_batch llama_batch_get_one(
|
12755
13861
|
llama_token * tokens,
|
12756
13862
|
int32_t n_tokens,
|
@@ -12817,32 +13923,81 @@ int32_t llama_decode(
|
|
12817
13923
|
return ret;
|
12818
13924
|
}
|
12819
13925
|
|
13926
|
+
void llama_synchronize(struct llama_context * ctx) {
|
13927
|
+
ggml_backend_sched_synchronize(ctx->sched);
|
13928
|
+
|
13929
|
+
// FIXME: if multiple single tokens are evaluated without a synchronization,
|
13930
|
+
// the stats will be added to the prompt evaluation stats
|
13931
|
+
// this should only happen when using batch size 1 to evaluate a batch
|
13932
|
+
|
13933
|
+
// add the evaluation to the stats
|
13934
|
+
if (ctx->n_queued_tokens == 1) {
|
13935
|
+
ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
|
13936
|
+
ctx->n_eval++;
|
13937
|
+
} else if (ctx->n_queued_tokens > 1) {
|
13938
|
+
ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
|
13939
|
+
ctx->n_p_eval += ctx->n_queued_tokens;
|
13940
|
+
}
|
13941
|
+
|
13942
|
+
// get a more accurate load time, upon first eval
|
13943
|
+
if (ctx->n_queued_tokens > 0 && !ctx->has_evaluated_once) {
|
13944
|
+
ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
|
13945
|
+
ctx->has_evaluated_once = true;
|
13946
|
+
}
|
13947
|
+
|
13948
|
+
ctx->n_queued_tokens = 0;
|
13949
|
+
ctx->t_compute_start_us = 0;
|
13950
|
+
}
|
13951
|
+
|
12820
13952
|
float * llama_get_logits(struct llama_context * ctx) {
|
12821
|
-
|
13953
|
+
llama_synchronize(ctx);
|
13954
|
+
|
13955
|
+
return ctx->logits;
|
12822
13956
|
}
|
12823
13957
|
|
12824
13958
|
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
12825
13959
|
assert(ctx->logits_valid.at(i));
|
12826
|
-
|
13960
|
+
|
13961
|
+
llama_synchronize(ctx);
|
13962
|
+
|
13963
|
+
return ctx->logits + i*ctx->model.hparams.n_vocab;
|
12827
13964
|
}
|
12828
13965
|
|
12829
13966
|
float * llama_get_embeddings(struct llama_context * ctx) {
|
12830
|
-
|
13967
|
+
llama_synchronize(ctx);
|
13968
|
+
|
13969
|
+
return ctx->embd;
|
12831
13970
|
}
|
12832
13971
|
|
12833
13972
|
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
|
12834
|
-
|
13973
|
+
llama_synchronize(ctx);
|
13974
|
+
|
13975
|
+
return ctx->embd + i*ctx->model.hparams.n_embd;
|
13976
|
+
}
|
13977
|
+
|
13978
|
+
float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
|
13979
|
+
llama_synchronize(ctx);
|
13980
|
+
|
13981
|
+
auto it = ctx->embd_seq.find(seq_id);
|
13982
|
+
if (it == ctx->embd_seq.end()) {
|
13983
|
+
return nullptr;
|
13984
|
+
}
|
13985
|
+
|
13986
|
+
return it->second.data();
|
12835
13987
|
}
|
12836
13988
|
|
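llama_get_embeddings_seq returns the pooled embedding stored per sequence, or nullptr when none exists. A sketch, assuming the context was created with cparams.embeddings = true and a pooling type other than NONE:

    #include "llama.h"
    #include <cstdio>

    // sketch: read a pooled per-sequence embedding after llama_decode
    void print_seq_embedding(llama_context * ctx, const llama_model * model, llama_seq_id seq) {
        const float * embd = llama_get_embeddings_seq(ctx, seq); // synchronizes internally
        if (embd == nullptr) {
            fprintf(stderr, "no pooled embedding stored for sequence %d\n", seq);
            return;
        }
        const int n_embd = llama_n_embd(model);
        printf("seq %d: first dim = %f (of %d)\n", seq, embd[0], n_embd);
    }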
12837
13989
|
const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
|
13990
|
+
GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
12838
13991
|
return model->vocab.id_to_token[token].text.c_str();
|
12839
13992
|
}
|
12840
13993
|
|
12841
13994
|
float llama_token_get_score(const struct llama_model * model, llama_token token) {
|
13995
|
+
GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
12842
13996
|
return model->vocab.id_to_token[token].score;
|
12843
13997
|
}
|
12844
13998
|
|
12845
13999
|
llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
|
14000
|
+
GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
12846
14001
|
return model->vocab.id_to_token[token].type;
|
12847
14002
|
}
|
12848
14003
|
|
@@ -12887,12 +14042,12 @@ int32_t llama_tokenize(
|
|
12887
14042
|
const char * text,
|
12888
14043
|
int32_t text_len,
|
12889
14044
|
llama_token * tokens,
|
12890
|
-
int32_t
|
14045
|
+
int32_t n_tokens_max,
|
12891
14046
|
bool add_bos,
|
12892
14047
|
bool special) {
|
12893
14048
|
auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
|
12894
14049
|
|
12895
|
-
if (
|
14050
|
+
if (n_tokens_max < (int) res.size()) {
|
12896
14051
|
// LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
|
12897
14052
|
return -((int) res.size());
|
12898
14053
|
}
|
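Only the parameter name changed here (n_tokens_max), but it is a reminder of how the negative return value works: it reports the required token count so the call can be retried. A sketch (the initial buffer heuristic is illustrative):

    #include "llama.h"
    #include <string>
    #include <vector>

    // sketch: tokenize with automatic buffer growth
    std::vector<llama_token> tokenize(const llama_model * model, const std::string & text, bool add_bos) {
        std::vector<llama_token> tokens(text.size() + 2); // rough upper bound
        int32_t n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                                   tokens.data(), (int32_t) tokens.size(), add_bos, /*special=*/false);
        if (n < 0) {               // buffer too small: -n is the exact count needed
            tokens.resize(-n);
            n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                               tokens.data(), (int32_t) tokens.size(), add_bos, false);
        }
        tokens.resize(n);
        return tokens;
    }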
@@ -12906,9 +14061,9 @@ int32_t llama_tokenize(
|
|
12906
14061
|
|
12907
14062
|
static std::string llama_decode_text(const std::string & text) {
|
12908
14063
|
std::string decoded_text;
|
12909
|
-
auto unicode_sequences =
|
12910
|
-
for (auto& unicode_sequence : unicode_sequences) {
|
12911
|
-
decoded_text +=
|
14064
|
+
auto unicode_sequences = unicode_cpts_from_utf8(text);
|
14065
|
+
for (auto & unicode_sequence : unicode_sequences) {
|
14066
|
+
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));
|
12912
14067
|
}
|
12913
14068
|
|
12914
14069
|
return decoded_text;
|
@@ -12933,7 +14088,7 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
|
|
12933
14088
|
} else if (llama_is_user_defined_token(model->vocab, token)) {
|
12934
14089
|
std::string result = model->vocab.id_to_token[token].text;
|
12935
14090
|
if (length < (int) result.length()) {
|
12936
|
-
return -result.length();
|
14091
|
+
return -(int) result.length();
|
12937
14092
|
}
|
12938
14093
|
memcpy(buf, result.c_str(), result.length());
|
12939
14094
|
return result.length();
|
@@ -12968,7 +14123,7 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
|
|
12968
14123
|
} else if (llama_is_user_defined_token(model->vocab, token)) {
|
12969
14124
|
std::string result = model->vocab.id_to_token[token].text;
|
12970
14125
|
if (length < (int) result.length()) {
|
12971
|
-
return -result.length();
|
14126
|
+
return -(int) result.length();
|
12972
14127
|
}
|
12973
14128
|
memcpy(buf, result.c_str(), result.length());
|
12974
14129
|
return result.length();
|
@@ -13005,7 +14160,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
13005
14160
|
std::string & dest, bool add_ass) {
|
13006
14161
|
// Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
|
13007
14162
|
std::stringstream ss;
|
13008
|
-
if (tmpl.find("<|im_start|>") != std::string::npos) {
|
14163
|
+
if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) {
|
13009
14164
|
// chatml template
|
13010
14165
|
for (auto message : chat) {
|
13011
14166
|
ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
|
@@ -13013,7 +14168,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
13013
14168
|
if (add_ass) {
|
13014
14169
|
ss << "<|im_start|>assistant\n";
|
13015
14170
|
}
|
13016
|
-
} else if (tmpl.find("[INST]") != std::string::npos) {
|
14171
|
+
} else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) {
|
13017
14172
|
// llama2 template and its variants
|
13018
14173
|
// [variant] support system message
|
13019
14174
|
bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
|
@@ -13048,7 +14203,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
13048
14203
|
}
|
13049
14204
|
}
|
13050
14205
|
// llama2 templates seem to not care about "add_generation_prompt"
|
13051
|
-
} else if (tmpl.find("<|user|>") != std::string::npos) {
|
14206
|
+
} else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
|
13052
14207
|
// zephyr template
|
13053
14208
|
for (auto message : chat) {
|
13054
14209
|
ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
|
@@ -13056,7 +14211,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
13056
14211
|
if (add_ass) {
|
13057
14212
|
ss << "<|assistant|>\n";
|
13058
14213
|
}
|
13059
|
-
} else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
|
14214
|
+
} else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) {
|
13060
14215
|
// mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
|
13061
14216
|
for (auto message : chat) {
|
13062
14217
|
std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
|
@@ -13065,7 +14220,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
13065
14220
|
if (add_ass) {
|
13066
14221
|
ss << "<s>assistant\n";
|
13067
14222
|
}
|
13068
|
-
} else if (tmpl.find("<start_of_turn>") != std::string::npos) {
|
14223
|
+
} else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
|
13069
14224
|
// google/gemma-7b-it
|
13070
14225
|
std::string system_prompt = "";
|
13071
14226
|
for (auto message : chat) {
|
@@ -13087,6 +14242,26 @@ static int32_t llama_chat_apply_template_internal(
|
|
13087
14242
|
if (add_ass) {
|
13088
14243
|
ss << "<start_of_turn>model\n";
|
13089
14244
|
}
|
14245
|
+
} else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) {
|
14246
|
+
// OrionStarAI/Orion-14B-Chat
|
14247
|
+
std::string system_prompt = "";
|
14248
|
+
for (auto message : chat) {
|
14249
|
+
std::string role(message->role);
|
14250
|
+
if (role == "system") {
|
14251
|
+
// there is no system message support, so we merge it into the user prompt
|
14252
|
+
system_prompt = message->content;
|
14253
|
+
continue;
|
14254
|
+
} else if (role == "user") {
|
14255
|
+
ss << "Human: ";
|
14256
|
+
if (!system_prompt.empty()) {
|
14257
|
+
ss << system_prompt << "\n\n";
|
14258
|
+
system_prompt = "";
|
14259
|
+
}
|
14260
|
+
ss << message->content << "\n\nAssistant: </s>";
|
14261
|
+
} else {
|
14262
|
+
ss << message->content << "</s>";
|
14263
|
+
}
|
14264
|
+
}
|
13090
14265
|
} else {
|
13091
14266
|
// template not supported
|
13092
14267
|
return -1;
|
@@ -13112,23 +14287,27 @@ LLAMA_API int32_t llama_chat_apply_template(
|
|
13112
14287
|
int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
|
13113
14288
|
if (res < 0) {
|
13114
14289
|
// worst case: there is no information about template, we will use chatml by default
|
13115
|
-
curr_tmpl = "
|
14290
|
+
curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
|
13116
14291
|
} else {
|
13117
14292
|
curr_tmpl = std::string(model_template.data(), model_template.size());
|
13118
14293
|
}
|
13119
14294
|
}
|
14295
|
+
|
13120
14296
|
// format the chat to string
|
13121
14297
|
std::vector<const llama_chat_message *> chat_vec;
|
13122
14298
|
chat_vec.resize(n_msg);
|
13123
14299
|
for (size_t i = 0; i < n_msg; i++) {
|
13124
14300
|
chat_vec[i] = &chat[i];
|
13125
14301
|
}
|
14302
|
+
|
13126
14303
|
std::string formatted_chat;
|
13127
14304
|
int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
|
13128
14305
|
if (res < 0) {
|
13129
14306
|
return res;
|
13130
14307
|
}
|
13131
|
-
|
14308
|
+
if (buf && length > 0) {
|
14309
|
+
strncpy(buf, formatted_chat.c_str(), length);
|
14310
|
+
}
|
13132
14311
|
return res;
|
13133
14312
|
}
|
13134
14313
|
|
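With the template matching above, llama_chat_apply_template now also accepts shorthand names such as "chatml", "llama2", "zephyr", "monarch", "gemma" and "orion". A sketch of calling it with the "chatml" shorthand; the messages and the initial buffer size are illustrative:

    #include "llama.h"
    #include <string>
    #include <vector>

    // sketch: format a chat with an explicitly named template
    std::string format_chatml(const llama_model * model) {
        const llama_chat_message chat[] = {
            { "system", "You are a helpful assistant." },
            { "user",   "Write a haiku about diffs."   },
        };
        const size_t n_msg = sizeof(chat)/sizeof(chat[0]);

        std::vector<char> buf(4096);
        int32_t n = llama_chat_apply_template(model, "chatml", chat, n_msg,
                                              /*add_ass=*/true, buf.data(), (int32_t) buf.size());
        if (n > (int32_t) buf.size()) { // output was truncated: grow and retry
            buf.resize(n);
            n = llama_chat_apply_template(model, "chatml", chat, n_msg, true, buf.data(), (int32_t) buf.size());
        }
        return n < 0 ? std::string() : std::string(buf.data(), n);
    }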