llama_cpp 0.13.0 → 0.14.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/ext/llama_cpp/llama_cpp.cpp +130 -26
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -4
- data/vendor/tmp/llama.cpp/Makefile +30 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +17 -5
- data/vendor/tmp/llama.cpp/ggml-backend.c +371 -151
- data/vendor/tmp/llama.cpp/ggml-backend.h +54 -29
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +765 -830
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -2
- data/vendor/tmp/llama.cpp/ggml-metal.m +105 -27
- data/vendor/tmp/llama.cpp/ggml-metal.metal +99 -920
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-quants.c +557 -1129
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +3332 -1195
- data/vendor/tmp/llama.cpp/ggml-sycl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +39336 -43461
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1302 -781
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +1 -0
- data/vendor/tmp/llama.cpp/ggml.c +734 -356
- data/vendor/tmp/llama.cpp/ggml.h +91 -51
- data/vendor/tmp/llama.cpp/llama.cpp +1938 -759
- data/vendor/tmp/llama.cpp/llama.h +53 -21
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
Diff of data/vendor/tmp/llama.cpp/llama.cpp:

@@ -104,6 +104,7 @@
 #define LLAMA_MAX_NODES 8192
 #define LLAMA_MAX_EXPERTS 8
 
+
 //
 // logging
 //
@@ -211,10 +212,12 @@ enum llm_arch {
     LLM_ARCH_INTERNLM2,
     LLM_ARCH_MINICPM,
     LLM_ARCH_GEMMA,
+    LLM_ARCH_STARCODER2,
+    LLM_ARCH_MAMBA,
     LLM_ARCH_UNKNOWN,
 };
 
-static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA, "llama" },
     { LLM_ARCH_FALCON, "falcon" },
     { LLM_ARCH_GPT2, "gpt2" },
@@ -238,6 +241,9 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_INTERNLM2, "internlm2" },
     { LLM_ARCH_MINICPM, "minicpm" },
     { LLM_ARCH_GEMMA, "gemma" },
+    { LLM_ARCH_STARCODER2, "starcoder2" },
+    { LLM_ARCH_MAMBA, "mamba" },
+    { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
 enum llm_kv {
@@ -252,6 +258,7 @@ enum llm_kv {
     LLM_KV_GENERAL_SOURCE_URL,
     LLM_KV_GENERAL_SOURCE_HF_REPO,
 
+    LLM_KV_VOCAB_SIZE,
     LLM_KV_CONTEXT_LENGTH,
     LLM_KV_EMBEDDING_LENGTH,
     LLM_KV_BLOCK_COUNT,
@@ -280,6 +287,11 @@ enum llm_kv {
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,
 
+    LLM_KV_SSM_INNER_SIZE,
+    LLM_KV_SSM_CONV_KERNEL,
+    LLM_KV_SSM_STATE_SIZE,
+    LLM_KV_SSM_TIME_STEP_RANK,
+
     LLM_KV_TOKENIZER_MODEL,
     LLM_KV_TOKENIZER_LIST,
     LLM_KV_TOKENIZER_TOKEN_TYPE,
@@ -298,7 +310,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_RWKV,
 };
 
-static std::map<llm_kv, const char *> LLM_KV_NAMES = {
+static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
     { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
     { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
@@ -310,6 +322,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
     { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
 
+    { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
     { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
     { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
     { LLM_KV_BLOCK_COUNT, "%s.block_count" },
@@ -338,6 +351,11 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
     { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
 
+    { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
+    { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
+    { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
+    { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
+
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
@@ -362,7 +380,7 @@ struct LLM_KV {
     llm_arch arch;
 
     std::string operator()(llm_kv kv) const {
-        return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
+        return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
     }
 };
 
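The name maps are now declared `const`, which is why the lookups switch from `operator[]` to `.at()`: `std::map::operator[]` default-inserts missing keys and is therefore only callable on a non-const map, while `.at()` is const-qualified and throws on a missing key. A minimal standalone sketch of the distinction (illustrative code, not part of the gem):

    #include <cstdio>
    #include <map>

    static const std::map<int, const char *> NAMES = {
        { 0, "llama"  },
        { 1, "falcon" },
    };

    int main() {
        // NAMES[0] would not compile: operator[] may insert and is non-const.
        // .at() is const-qualified and throws std::out_of_range on a missing key.
        std::printf("%s\n", NAMES.at(0));

        // find() is the non-throwing way to probe for a key, as LLM_TN does below.
        if (NAMES.find(2) == NAMES.end()) {
            std::printf("__missing__\n");
        }
        return 0;
    }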
@@ -395,9 +413,16 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
+    LLM_TENSOR_SSM_IN,
+    LLM_TENSOR_SSM_CONV1D,
+    LLM_TENSOR_SSM_X,
+    LLM_TENSOR_SSM_DT,
+    LLM_TENSOR_SSM_A,
+    LLM_TENSOR_SSM_D,
+    LLM_TENSOR_SSM_OUT,
 };
 
-static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
+static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
     {
         LLM_ARCH_LLAMA,
         {
@@ -779,6 +804,40 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_STARCODER2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_MAMBA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+            { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+            { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" },
+            { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+            { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+            { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+            { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -812,38 +871,38 @@ struct LLM_TN {
     llm_arch arch;
 
     std::string operator()(llm_tensor tensor) const {
-        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
             return "__missing__";
         }
-        return LLM_TENSOR_NAMES[arch].at(tensor);
+        return LLM_TENSOR_NAMES.at(arch).at(tensor);
     }
 
     std::string operator()(llm_tensor tensor, const std::string & suffix) const {
-        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
             return "__missing__";
         }
-        return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
+        return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix;
     }
 
     std::string operator()(llm_tensor tensor, int bid) const {
-        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
             return "__missing__";
         }
-        return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
+        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid);
     }
 
     std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
-        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
             return "__missing__";
         }
-        return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
+        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix;
     }
 
     std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
-        if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
             return "__missing__";
         }
-        return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
+        return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
     }
 };
 
@@ -851,16 +910,16 @@ struct LLM_TN {
 // gguf helpers
 //
 
-static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
+static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
     { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
     { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
     { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
 };
 
-static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
+static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
     for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
         if (kv.second == name) {
-            return kv.first;
+            return (llama_rope_scaling_type) kv.first;
         }
     }
 
@@ -921,21 +980,6 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
     }
 }
 
-//
-// ggml helpers
-//
-
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
 //
 // llama helpers
 //
@@ -1409,7 +1453,9 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
         buft = ggml_backend_cuda_host_buffer_type();
     }
 #elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_host_buffer_type();
+    if (host_buffer) {
+        buft = ggml_backend_sycl_host_buffer_type();
+    }
 #elif defined(GGML_USE_CPU_HBM)
     buft = ggml_backend_cpu_hbm_buffer_type();
 #elif defined(GGML_USE_VULKAN)
@@ -1463,6 +1509,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
     }
 #endif
 
+#ifdef GGML_USE_SYCL
+    if (ggml_backend_sycl_get_device_count() > 1) {
+        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+    }
+#endif
+
     if (buft == nullptr) {
         buft = llama_default_buffer_type_offload(fallback_gpu);
     }
@@ -1474,6 +1526,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 static size_t llama_get_device_count() {
 #if defined(GGML_USE_CUBLAS)
     return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
     return ggml_backend_vk_get_device_count();
 #else
@@ -1487,6 +1541,11 @@ static size_t llama_get_device_memory(int device) {
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &total, &free);
     return free;
+#elif defined(GGML_USE_SYCL)
+    size_t total;
+    size_t free;
+    ggml_backend_sycl_get_device_memory(device, &total, &free);
+    return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
@@ -1575,7 +1634,12 @@ struct llama_hparams {
     float rope_freq_base_train;
     float rope_freq_scale_train;
     uint32_t n_yarn_orig_ctx;
-
+
+    // for State Space Models
+    uint32_t ssm_d_conv = 0;
+    uint32_t ssm_d_inner = 0;
+    uint32_t ssm_d_state = 0;
+    uint32_t ssm_dt_rank = 0;
 
     float f_clamp_kqv = 0.0f;
     float f_max_alibi_bias = 0.0f;
@@ -1583,8 +1647,9 @@ struct llama_hparams {
     bool causal_attn = true;
     bool need_kq_pos = false;
 
-    enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
-    enum llama_rope_type    rope_type    = LLAMA_ROPE_TYPE_NONE;
+    enum llama_pooling_type      pooling_type            = LLAMA_POOLING_TYPE_NONE;
+    enum llama_rope_type         rope_type               = LLAMA_ROPE_TYPE_NONE;
+    enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
 
     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
@@ -1604,6 +1669,11 @@ struct llama_hparams {
         if (this->rope_finetuned != other.rope_finetuned) return true;
         if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
 
+        if (this->ssm_d_conv != other.ssm_d_conv) return true;
+        if (this->ssm_d_inner != other.ssm_d_inner) return true;
+        if (this->ssm_d_state != other.ssm_d_state) return true;
+        if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
+
         const float EPSILON = 1e-9f;
 
         if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
@@ -1615,6 +1685,9 @@ struct llama_hparams {
     }
 
     uint32_t n_gqa() const {
+        if (n_head_kv == 0) {
+            return 0;
+        }
         return n_head/n_head_kv;
     }
 
@@ -1625,16 +1698,29 @@ struct llama_hparams {
     uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads
         return n_embd_head_v * n_head_kv;
     }
+
+    uint32_t n_embd_k_s() const { // dimension of the rolling state embeddings
+        // corresponds to Mamba's conv_states size
+        // TODO: maybe support other convolution strides than 1
+        // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
+        return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
+    }
+
+    uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
+        // corresponds to Mamba's ssm_states size
+        return ssm_d_state * ssm_d_inner;
+    }
 };
 
 struct llama_cparams {
-    uint32_t n_ctx;
+    uint32_t n_ctx; // context size used during inference
     uint32_t n_batch;
+    uint32_t n_ubatch;
     uint32_t n_threads; // number of threads to use for generation
     uint32_t n_threads_batch; // number of threads to use for batch processing
 
-    float    rope_freq_base;
-    float    rope_freq_scale;
+    float rope_freq_base;
+    float rope_freq_scale;
 
     uint32_t n_yarn_orig_ctx;
     // These hyperparameters are not exposed in GGUF, because all
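The two new accessors size Mamba's per-layer states instead of per-token K/V rows: a rolling convolution state of `(ssm_d_conv - 1) * ssm_d_inner` values and a recurrent state of `ssm_d_state * ssm_d_inner` values. A quick sanity check of the formulas, using typical hyperparameters for a small Mamba model (d_conv = 4, d_state = 16, d_inner = 2 * n_embd = 1536 for n_embd = 768; illustrative numbers, not taken from the diff):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t ssm_d_conv  = 4;    // convolution kernel size (assumed default)
        const uint32_t ssm_d_inner = 1536; // 2 * n_embd for n_embd = 768 (assumed)
        const uint32_t ssm_d_state = 16;   // SSM state size (assumed default)

        // same formulas as llama_hparams::n_embd_k_s() / n_embd_v_s()
        const uint32_t n_embd_k_s = (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
        const uint32_t n_embd_v_s = ssm_d_state * ssm_d_inner;

        std::printf("conv state per layer: %u values\n", n_embd_k_s); // 4608
        std::printf("ssm state per layer:  %u values\n", n_embd_v_s); // 24576
        return 0;
    }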
@@ -1645,8 +1731,11 @@ struct llama_cparams {
     float yarn_beta_slow;
     float defrag_thold;
 
+    bool embeddings;
+    bool causal_attn;
     bool offload_kqv;
-
+
+    enum llama_pooling_type pooling_type;
 
     ggml_backend_sched_eval_callback cb_eval;
     void * cb_eval_user_data;
@@ -1700,11 +1789,27 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_b; // b2
     struct ggml_tensor * ffn_up_b; // b3
     struct ggml_tensor * ffn_act;
+
+    // mamba proj
+    struct ggml_tensor * ssm_in;
+    struct ggml_tensor * ssm_x;
+    struct ggml_tensor * ssm_dt;
+    struct ggml_tensor * ssm_out;
+
+    // mamba
+    struct ggml_tensor * ssm_conv1d;
+    struct ggml_tensor * ssm_a;
+    struct ggml_tensor * ssm_d;
+
+    // mamba bias
+    struct ggml_tensor * ssm_conv1d_b;
+    struct ggml_tensor * ssm_dt_b;
 };
 
 struct llama_kv_cell {
     llama_pos pos = -1;
     llama_pos delta = 0;
+    int32_t src = 0; // used by recurrent state models to copy states
 
     std::set<llama_seq_id> seq_id;
 
@@ -1725,6 +1830,9 @@ struct llama_kv_cell {
 struct llama_kv_cache {
     bool has_shift = false;
     bool do_defrag = false;
+    bool do_copy = false;
+    // with recurrent state models, a cell can hold the state for more than one past token
+    bool recurrent = false;
 
     // Note: The value of head isn't only used to optimize searching
     // for a free KV slot. llama_decode_internal also uses it, so it
@@ -1904,8 +2012,7 @@ struct llama_context {
         ggml_vk_free_cpu_assist();
 #endif
 
-        ggml_backend_buffer_free(buf_input);
-        ggml_free(ctx_input);
+        ggml_backend_buffer_free(buf_output);
     }
 
     llama_cparams cparams;
@@ -1931,36 +2038,54 @@ struct llama_context {
     int64_t t_p_eval_us = 0;
     int64_t t_eval_us = 0;
 
+    int64_t t_compute_start_us = 0;
+    int64_t n_queued_tokens = 0;
+
     int32_t n_sample = 0; // number of tokens sampled
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
     int32_t n_eval = 0; // number of eval calls
 
+    // host buffer for the model output (logits and embeddings)
+    ggml_backend_buffer_t buf_output = nullptr;
+
     // decode output (2-dimensional array: [n_tokens][n_vocab])
-    std::vector<float> logits;
+    size_t logits_size = 0;
+    float * logits = nullptr;
+
 #ifndef NDEBUG
     // guard against access to unset logits
     std::vector<bool> logits_valid;
 #endif
     bool logits_all = false;
 
-    // input embedding (1-dimensional array: [n_embd])
-    std::vector<float> embedding;
+    // embeddings output (2-dimensional array: [n_tokens][n_embd])
+    // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
+    size_t embd_size = 0;
+    float * embd = nullptr;
+
+    // sequence embeddings output (map of [n_embd] vectors)
+    // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
+    std::map<llama_seq_id, std::vector<float>> embd_seq;
 
     // memory buffers used to evaluate the model
     std::vector<uint8_t> buf_compute_meta;
     ggml_backend_sched_t sched = nullptr;
 
+    ggml_abort_callback abort_callback = nullptr;
+    void * abort_callback_data = nullptr;
+
     // input tensors
-    ggml_backend_buffer_t buf_input = nullptr;
-    ggml_context * ctx_input = nullptr;
     struct ggml_tensor * inp_tokens; // I32 [n_batch]
     struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
     struct ggml_tensor * inp_pos; // I32 [n_batch]
-    struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
-    struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx]
-    struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
+    struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
+    struct ggml_tensor * inp_KQ_pos; // F32 [kv_size]
+    struct ggml_tensor * inp_K_shift; // I32 [kv_size]
     struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
     struct ggml_tensor * inp_cls; // I32 [n_batch]
+    struct ggml_tensor * inp_s_copy; // I32 [kv_size]
+    struct ggml_tensor * inp_s_mask; // F32 [1, kv_size]
+    struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch]
 
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
@@ -1976,25 +2101,42 @@ static bool llama_kv_cache_init(
     const llama_model & model,
     ggml_type type_k,
     ggml_type type_v,
-    uint32_t n_ctx,
+    uint32_t kv_size,
     bool offload) {
     const struct llama_hparams & hparams = model.hparams;
 
-    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
+    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
     const int64_t n_layer = hparams.n_layer;
 
     cache.has_shift = false;
 
+    // TODO: find a nicer way to add other recurrent model architectures
+    cache.recurrent = model.arch == LLM_ARCH_MAMBA;
+
+    // TODO: support mixed reccurent Transformer architectues
+    // NOTE: (!a || b) is a logical implication (a -> b)
+    GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
+    GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());
+    GGML_ASSERT( cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_gqa());
+    GGML_ASSERT( cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_gqa());
+
     cache.head = 0;
-    cache.size = n_ctx;
+    cache.size = kv_size;
     cache.used = 0;
 
     cache.type_k = type_k;
     cache.type_v = type_v;
 
     cache.cells.clear();
-    cache.cells.resize(n_ctx);
+    cache.cells.resize(kv_size);
+
+    if (cache.recurrent) {
+        // init state copy sources
+        for (uint32_t i = 0; i < cache.size; ++i) {
+            cache.cells[i].src = i;
+        }
+    }
 
 #ifdef GGML_USE_CLBLAST
     offload = false;
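The four `GGML_ASSERT` lines use `!a || b` as a logical implication: a recurrent cache must be sized by the state dimensions, and a non-recurrent cache by the GQA dimensions. A small standalone illustration of the pattern (the sizes here are hypothetical):

    #include <cassert>

    // "a implies b" written as (!a || b): the assert fires only when
    // a holds but b does not.
    static void check(bool recurrent, unsigned rows, unsigned state_rows, unsigned gqa_rows) {
        assert(!recurrent || rows == state_rows); // recurrent     -> state-sized rows
        assert( recurrent || rows == gqa_rows);   // not recurrent -> GQA-sized rows
    }

    int main() {
        check(true,  4608, 4608, 0);    // ok: recurrent cache sized by the state
        check(false, 1024, 0,    1024); // ok: regular cache sized by n_embd_k_gqa
        return 0;
    }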
@@ -2033,8 +2175,8 @@ static bool llama_kv_cache_init(
 
     for (int i = 0; i < (int) n_layer; i++) {
         struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
-        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*n_ctx);
-        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*n_ctx);
+        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
         ggml_format_name(k, "cache_k_l%d", i);
         ggml_format_name(v, "cache_v_l%d", i);
         cache.k_l.push_back(k);
@@ -2068,6 +2210,54 @@ static bool llama_kv_cache_find_slot(
     const uint32_t n_ctx = cache.size;
     const uint32_t n_tokens = batch.n_tokens;
 
+    if (cache.recurrent) {
+        // For recurrent state architectures (like Mamba),
+        // each KV cache cell can store the state for a whole sequence.
+
+        llama_seq_id min = cache.size - 1;
+        llama_seq_id max = 0;
+
+        for (uint32_t i = 0; i < n_tokens; ++i) {
+            for (int32_t j = 0; j < batch.n_seq_id[i]; ++j) {
+                llama_seq_id seq_id = batch.seq_id[i][j];
+                // make sure it's a valid seq_id
+                if ((uint32_t) seq_id < cache.size) {
+                    if (seq_id > max) {
+                        max = seq_id;
+                    }
+                    if (seq_id < min) {
+                        min = seq_id;
+                    }
+                    // Assuming the tokens are in-order
+                    if (batch.pos[i] != cache.cells[seq_id].pos + 1) {
+                        // What should happen when the pos backtracks or skips a value?
+                        // Clearing the state mid-batch would require special-casing which isn't done.
+                        LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d\n",
+                            __func__, batch.pos[i], cache.cells[seq_id].pos, seq_id);
+                    }
+                    if (cache.cells[seq_id].pos < 0 && 0 <= batch.pos[i]) {
+                        cache.used += 1;
+                    }
+                    cache.cells[seq_id].pos = batch.pos[i];
+                    // NOTE: seq_ids are not inserted here; they are handled when the input tensors are set
+                } else {
+                    // too big seq_id
+                    // TODO: would it be possible to resize the KV cache size instead?
+                    LLAMA_LOG_ERROR("%s: seq_id=%d >= kv_size=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size);
+                    return false;
+                }
+            }
+        }
+
+        // allow getting the range of used cells, from head to head + n
+        cache.head = min;
+        cache.n = max - min + 1;
+
+        // sanity check
+        return max >= min;
+    }
+    // otherwise, one cell per token.
+
     if (n_tokens > n_ctx) {
         LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
         return false;
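In the recurrent branch above, the cell index is the `seq_id` itself, so cell `i` holds the entire state of sequence `i`, and `head`/`n` are set to the `[min, max]` range of sequence ids touched by the batch so the graph can address all affected states as one contiguous window. A toy version of that bookkeeping (simplified batch layout, not the real `llama_batch`):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        // one state cell per sequence; a batch touching sequence ids 1, 3 and 2
        const std::vector<int32_t> seq_ids = { 1, 3, 2 };

        int32_t min_id = INT32_MAX;
        int32_t max_id = 0;
        for (int32_t id : seq_ids) {
            min_id = std::min(min_id, id);
            max_id = std::max(max_id, id);
        }

        // same outcome as the recurrent branch: head = min, n = max - min + 1
        std::printf("head=%d n=%d\n", min_id, max_id - min_id + 1); // head=1 n=3
        return 0;
    }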
@@ -2116,10 +2306,12 @@ static bool llama_kv_cache_find_slot(
 }
 
 // find how many cells are currently in use
-static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
-    for (uint32_t i = cache.size - 1; i > 0; --i) {
-        if (cache.cells[i].pos >= 0 && !cache.cells[i].is_empty()) {
-            return i + 1;
+static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
+    for (uint32_t i = cache.size; i > 0; --i) {
+        const llama_kv_cell & cell = cache.cells[i - 1];
+
+        if (cell.pos >= 0 && !cell.is_empty()) {
+            return i;
         }
     }
 
@@ -2135,7 +2327,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
     cache.used = 0;
 }
 
-static void llama_kv_cache_seq_rm(
+static bool llama_kv_cache_seq_rm(
     struct llama_kv_cache & cache,
     llama_seq_id seq_id,
     llama_pos p0,
@@ -2145,6 +2337,25 @@ static void llama_kv_cache_seq_rm(
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
 
+    // models like Mamba can't have a state partially erased
+    if (cache.recurrent) {
+        if (seq_id >= (int64_t) cache.size) {
+            // could be fatal
+            return false;
+        }
+        if (0 <= seq_id) {
+            // partial intersection is invalid
+            if ((0 < p0 && p0 <= cache.cells[seq_id].pos) || (0 < p1 && p1 <= cache.cells[seq_id].pos)) {
+                return false;
+            }
+        } else {
+            // seq_id is negative, then the range should include everything or nothing
+            if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
+                return false;
+            }
+        }
+    }
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             if (seq_id < 0) {
@@ -2166,6 +2377,8 @@ static void llama_kv_cache_seq_rm(
 
     // If we freed up a slot, set head to it so searching can start there.
     if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
+
+    return true;
 }
 
 static void llama_kv_cache_seq_cp(
@@ -2177,6 +2390,29 @@ static void llama_kv_cache_seq_cp(
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
 
+    if (cache.recurrent) {
+        if ((uint32_t) seq_id_dst < cache.size && (uint32_t) seq_id_src < cache.size) {
+            seq_id_src = cache.cells[seq_id_src].src;
+            GGML_ASSERT((uint32_t) seq_id_src < cache.size);
+            // intent to "copy from"
+            // supports copy chains thanks to taking the source of the source
+            cache.cells[seq_id_dst].src = seq_id_src;
+
+            // preserve the "keep or clear" status of the copied sequence
+            if (cache.cells[seq_id_src].has_seq_id(seq_id_src)) {
+                cache.cells[seq_id_dst].seq_id.insert(seq_id_dst);
+            } else {
+                cache.cells[seq_id_dst].seq_id.erase(seq_id_dst);
+            }
+
+            cache.do_copy = true;
+
+            cache.cells[seq_id_dst].pos = cache.cells[seq_id_src].pos;
+        }
+        return;
+    }
+    // otherwise, this is the KV cache of a Transformer-like model
+
     cache.head = 0;
 
     for (uint32_t i = 0; i < cache.size; ++i) {
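On a recurrent cache, `seq_cp` copies no tensor data yet: it only records `cells[dst].src` and sets `do_copy`, and the actual copy happens later through the `build_s_copy` graph. Resolving through `cache.cells[seq_id_src].src` first is what makes chains such as 0 -> 1 -> 2 collapse to the original cell. A toy version of the chain resolution (plain integers, not the real cache):

    #include <cstdio>
    #include <vector>

    int main() {
        // init: every cell is its own source, as llama_kv_cache_init does
        std::vector<int> src = { 0, 1, 2, 3 };

        auto copy = [&](int from, int to) {
            // take the source of the source, so chains collapse to the root
            src[to] = src[from];
        };

        copy(0, 1); // cell 1 now points at 0
        copy(1, 2); // cell 2 also points at 0, not at 1

        std::printf("src of 2 = %d\n", src[2]); // prints 0
        return 0;
    }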
@@ -2216,6 +2452,17 @@ static void llama_kv_cache_seq_add(
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
 
+    if (cache.recurrent) {
+        // for Mamba-like models, only the pos needs to be shifted
+        if (0 <= seq_id && seq_id < (int64_t) cache.size) {
+            llama_kv_cell & cell = cache.cells[seq_id];
+            if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
+                cell.pos += delta;
+            }
+        }
+        return;
+    }
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.has_shift = true;
@@ -2249,6 +2496,17 @@ static void llama_kv_cache_seq_div(
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
 
+    if (cache.recurrent) {
+        // for Mamba-like models, only the pos needs to be changed
+        if (0 <= seq_id && seq_id < (int64_t) cache.size) {
+            llama_kv_cell & cell = cache.cells[seq_id];
+            if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
+                cell.pos /= d;
+            }
+        }
+        return;
+    }
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.has_shift = true;
@@ -2891,7 +3149,11 @@ template<>
 bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
     uint32_t tmp;
     const bool found = get_key(kid, tmp, required);
-    result = (enum llama_pooling_type) tmp;
+    if (found) {
+        result = (enum llama_pooling_type) tmp;
+    } else {
+        result = LLAMA_POOLING_TYPE_UNSPECIFIED;
+    }
     return found;
 }
 
@@ -2982,10 +3244,11 @@ static const char * llama_model_type_name(e_model type) {
 
 static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
     switch (type) {
-        case LLAMA_VOCAB_TYPE_SPM: return "SPM";
-        case LLAMA_VOCAB_TYPE_BPE: return "BPE";
-        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
-        default: return "unknown";
+        case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
+        case LLAMA_VOCAB_TYPE_SPM: return "SPM";
+        case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
+        default: return "unknown";
     }
 }
 
@@ -3017,14 +3280,14 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
 
     // get hparams kv
-    ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
-    ml.get_key  (LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
-    ml.get_key  (LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
-    ml.get_key  (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
-    ml.get_key  (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
-    ml.get_key  (LLM_KV_BLOCK_COUNT, hparams.n_layer);
-    ml.get_key  (LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
-    ml.get_key  (LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+    ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
+    ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
+    ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
+    ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
+    ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
+    ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
+    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
 
     GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
     GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
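The new `n_vocab` line relies on short-circuit evaluation: `get_key(..., false)` reports whether the optional `%s.vocab_size` key was present, and only when it fails does the right-hand `get_arr_n` run and fall back to counting the tokenizer token list. The same pattern in miniature (the two lookup functions here are hypothetical stand-ins for `ml.get_key` / `ml.get_arr_n`):

    #include <cstdio>

    // each stand-in returns whether it succeeded and writes the value on success
    static bool get_vocab_size_key(unsigned & out) { (void) out; return false; } // key absent
    static bool count_token_list(unsigned & out) { out = 32000; return true; }

    int main() {
        unsigned n_vocab = 0;
        // the right side runs only if the left side failed
        const bool found = get_vocab_size_key(n_vocab) || count_token_list(n_vocab);
        std::printf("found=%d n_vocab=%u\n", found, n_vocab); // found=1 n_vocab=32000
        return 0;
    }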
@@ -3064,7 +3327,7 @@ static void llm_load_hparams(
 
     // sanity check for n_rot (optional)
     {
-        hparams.n_rot = hparams.n_embd / hparams.n_head;
+        hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
 
         ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
 
@@ -3077,10 +3340,10 @@ static void llm_load_hparams(
         // gpt-j n_rot = rotary_dim
     }
 
-    hparams.n_embd_head_k = hparams.n_embd / hparams.n_head;
+    hparams.n_embd_head_k = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
     ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
 
-    hparams.n_embd_head_v = hparams.n_embd / hparams.n_head;
+    hparams.n_embd_head_v = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
     ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
 
     // arch-specific KVs
@@ -3168,7 +3431,7 @@ static void llm_load_hparams(
             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
             ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
             ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-            ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+            ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
 
             switch (hparams.n_layer) {
                 case 3:
@@ -3320,6 +3583,46 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_STARCODER2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 30: model.type = e_model::MODEL_3B; break;
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_15B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_MAMBA:
+            {
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 24:
+                        switch (hparams.n_embd) {
+                            case 768: model.type = e_model::MODEL_SMALL; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 48:
+                        switch (hparams.n_embd) {
+                            case 1024: model.type = e_model::MODEL_MEDIUM; break;
+                            case 1536: model.type = e_model::MODEL_LARGE; break;
+                            case 2048: model.type = e_model::MODEL_XL; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 64:
+                        switch (hparams.n_embd) {
+                            case 2560: model.type = e_model::MODEL_3B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -3345,30 +3648,25 @@ static void llm_load_vocab(
 
     const auto kv = LLM_KV(model.arch);
 
-    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
-    if (token_idx == -1) {
-        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
-    }
-
-    const float * scores = nullptr;
-    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
-    if (score_idx != -1) {
-        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
-    }
-
-    const int * toktypes = nullptr;
-    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
-    if (toktype_idx != -1) {
-        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-    }
-
     // determine vocab type
     {
         std::string tokenizer_name;
 
         ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
 
-        if (tokenizer_name == "llama") {
+        if (tokenizer_name == "no_vocab") {
+            vocab.type = LLAMA_VOCAB_TYPE_NONE;
+
+            // default special tokens
+            vocab.special_bos_id = -1;
+            vocab.special_eos_id = -1;
+            vocab.special_unk_id = -1;
+            vocab.special_sep_id = -1;
+            vocab.special_pad_id = -1;
+            vocab.linefeed_id = -1;
+
+            return;
+        } else if (tokenizer_name == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
 
             // default special tokens
@@ -3395,7 +3693,7 @@ static void llm_load_vocab(
 
         for (int i = 0; i < n_merges; i++) {
             const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
-            GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
+            GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
 
             std::string first;
             std::string second;
@@ -3434,13 +3732,30 @@ static void llm_load_vocab(
         }
     }
 
+    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
+    if (token_idx == -1) {
+        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
+    }
+
+    const float * scores = nullptr;
+    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
+    if (score_idx != -1) {
+        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+    }
+
+    const int * toktypes = nullptr;
+    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
+    if (toktype_idx != -1) {
+        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+    }
+
     const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
 
     vocab.id_to_token.resize(n_vocab);
 
     for (uint32_t i = 0; i < n_vocab; i++) {
         std::string word = gguf_get_arr_str(ctx, token_idx, i);
-        GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
+        GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
 
         vocab.token_to_id[word] = i;
 
@@ -3632,6 +3947,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
     LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
+    LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
     LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
     LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
     LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
@@ -3639,6 +3955,10 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
     LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
     LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
+    LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
+    LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
+    LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
+    LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
     if (ml.n_elements >= 1e12) {
@@ -3692,6 +4012,7 @@ static bool llm_load_tensors(
 
     // there is very little benefit to offloading the input layer, so always keep it on the CPU
     model.buft_input = llama_default_buffer_type_cpu(true);
+    //model.buft_input = llama_default_buffer_type_offload(main_gpu);
 
     model.buft_layer.resize(n_layer);
 
@@ -3825,7 +4146,13 @@ static bool llm_load_tensors(
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     if (model.arch != LLM_ARCH_MINICPM){
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                        // if output is NULL, init from the input tok embed
+                        if (model.output == NULL) {
+                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                            ml.n_created--; // artificial tensor
+                            ml.size_data += ggml_nbytes(model.output);
+                        }
                     }
                 }
 
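Passing `false` as the trailing argument makes the `output.weight` tensor optional; when a model file omits it, the loader reuses the token-embedding matrix as the output projection (tied embeddings) and adjusts `n_created`/`size_data` so the tensor accounting still balances. With tied embeddings the logits are dot products of the hidden state against the same rows used for token lookup; a minimal sketch of that idea with plain arrays (illustrative numbers, not ggml):

    #include <cstdio>

    int main() {
        const int n_vocab = 3, n_embd = 2;
        // one matrix serves both roles when embeddings are tied
        const float tok_embd[n_vocab][n_embd] = { {1, 0}, {0, 1}, {1, 1} };

        const float hidden[n_embd] = { 0.5f, 2.0f };

        // output projection = dot product with each embedding row
        for (int v = 0; v < n_vocab; ++v) {
            float logit = 0.0f;
            for (int e = 0; e < n_embd; ++e) {
                logit += tok_embd[v][e] * hidden[e];
            }
            std::printf("logit[%d] = %.2f\n", v, logit);
        }
        return 0;
    }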
@@ -4490,6 +4817,107 @@ static bool llm_load_tensors(
|
|
4490
4817
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
4491
4818
|
}
|
4492
4819
|
} break;
|
4820
|
+
case LLM_ARCH_STARCODER2:
|
4821
|
+
{
|
4822
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4823
|
+
|
4824
|
+
// output
|
4825
|
+
{
|
4826
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4827
|
+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
4828
|
+
|
4829
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
4830
|
+
// if output is NULL, init from the input tok embed
|
4831
|
+
if (model.output == NULL) {
|
4832
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4833
|
+
ml.n_created--; // artificial tensor
|
4834
|
+
ml.size_data += ggml_nbytes(model.output);
|
4835
|
+
}
|
4836
|
+
|
4837
|
+
}
|
4838
|
+
|
4839
|
+
for (int i = 0; i < n_layer; ++i) {
|
4840
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
4841
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
4842
|
+
|
4843
|
+
auto & layer = model.layers[i];
|
4844
|
+
|
4845
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
4846
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
4847
|
+
|
4848
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
4849
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
4850
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
4851
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
4852
|
+
|
4853
|
+
// optional bias tensors
|
4854
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
4855
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
4856
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
4857
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
4858
|
+
|
4859
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
4860
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
4861
|
+
|
4862
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
4863
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
4864
|
+
|
4865
|
+
// optional bias tensors
|
4866
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
4867
|
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff});
|
4868
|
+
}
|
4869
|
+
} break;
|
4870
|
+
case LLM_ARCH_MAMBA:
|
4871
|
+
{
|
4872
|
+
const int64_t d_conv = hparams.ssm_d_conv;
|
4873
|
+
const int64_t d_inner = hparams.ssm_d_inner;
|
4874
|
+
const int64_t d_state = hparams.ssm_d_state;
|
4875
|
+
const int64_t dt_rank = hparams.ssm_dt_rank;
|
4876
|
+
// only an expansion factor of 2 is supported for now
|
4877
|
+
GGML_ASSERT(2 * n_embd == d_inner);
|
4878
|
+
|
4879
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4880
|
+
|
4881
|
+
// output
|
4882
|
+
{
|
4883
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4884
|
+
|
4885
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
4886
|
+
// if output is NULL, init from the input tok embed, duplicated to allow offloading
|
4887
|
+
if (model.output == NULL) {
|
4888
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4889
|
+
ml.n_created--; // artificial tensor
|
4890
|
+
ml.size_data += ggml_nbytes(model.output);
|
4891
|
+
}
|
4892
|
+
}
|
4893
|
+
|
4894
|
+
for (int i = 0; i < n_layer; ++i) {
|
4895
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
4896
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
4897
|
+
|
4898
|
+
auto & layer = model.layers[i];
|
4899
|
+
|
4900
|
+
// norm
|
4901
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
4902
|
+
|
4903
|
+
layer.ssm_in = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner});
|
4904
|
+
|
4905
|
+
layer.ssm_conv1d = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner});
|
4906
|
+
layer.ssm_conv1d_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner});
|
4907
|
+
|
4908
|
+
layer.ssm_x = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state});
|
4909
|
+
|
4910
|
+
layer.ssm_dt = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner});
|
4911
|
+
layer.ssm_dt_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner});
|
4912
|
+
|
4913
|
+
// no "weight" suffix for these
|
4914
|
+
layer.ssm_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner});
|
4915
|
+
layer.ssm_d = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_D, i), {d_inner});
|
4916
|
+
|
4917
|
+
// out_proj
|
4918
|
+
layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
|
4919
|
+
}
|
4920
|
+
} break;
|
4493
4921
|
default:
|
4494
4922
|
throw std::runtime_error("unknown architecture");
|
4495
4923
|
}
|
@@ -4610,7 +5038,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
|
4610
5038
|
|
4611
5039
|
llm_load_print_meta(ml, model);
|
4612
5040
|
|
4613
|
-
if (model.
|
5041
|
+
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
|
5042
|
+
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
|
4614
5043
|
throw std::runtime_error("vocab size mismatch");
|
4615
5044
|
}
|
4616
5045
|
|
@@ -4674,29 +5103,32 @@ enum llm_norm_type {
|
|
4674
5103
|
|
4675
5104
|
static struct ggml_tensor * llm_build_inp_embd(
|
4676
5105
|
struct ggml_context * ctx,
|
5106
|
+
struct llama_context & lctx,
|
4677
5107
|
const llama_hparams & hparams,
|
4678
5108
|
const llama_batch & batch,
|
4679
5109
|
struct ggml_tensor * tok_embd,
|
4680
|
-
struct ggml_tensor * inp_tokens,
|
4681
|
-
struct ggml_tensor * inp_embd,
|
4682
5110
|
const llm_build_cb & cb) {
|
4683
5111
|
const int64_t n_embd = hparams.n_embd;
|
4684
5112
|
|
4685
5113
|
struct ggml_tensor * inpL;
|
4686
5114
|
|
4687
5115
|
if (batch.token) {
|
4688
|
-
|
4689
|
-
cb(inp_tokens, "inp_tokens", -1);
|
5116
|
+
lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
|
5117
|
+
cb(lctx.inp_tokens, "inp_tokens", -1);
|
5118
|
+
ggml_set_input(lctx.inp_tokens);
|
4690
5119
|
|
4691
|
-
inpL = ggml_get_rows(ctx, tok_embd,
|
5120
|
+
inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
|
4692
5121
|
} else {
|
4693
5122
|
#ifdef GGML_USE_MPI
|
4694
5123
|
GGML_ASSERT(false && "not implemented");
|
4695
5124
|
#endif
|
4696
|
-
|
4697
|
-
inpL =
|
5125
|
+
lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
|
5126
|
+
inpL = lctx.inp_embd;
|
5127
|
+
ggml_set_input(lctx.inp_embd);
|
4698
5128
|
}
|
4699
5129
|
|
5130
|
+
cb(inpL, "inp_embd", -1);
|
5131
|
+
|
4700
5132
|
return inpL;
|
4701
5133
|
}
|
4702
5134
|
|
@@ -4715,6 +5147,8 @@ static void llm_build_kv_store(
|
|
4715
5147
|
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
4716
5148
|
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
4717
5149
|
|
5150
|
+
GGML_ASSERT(kv.size == n_ctx);
|
5151
|
+
|
4718
5152
|
// compute the transposed [n_tokens, n_embd] V matrix
|
4719
5153
|
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
|
4720
5154
|
//struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
|
@@ -4901,8 +5335,8 @@ static struct ggml_tensor * llm_build_kqv(
|
|
4901
5335
|
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
4902
5336
|
}
|
4903
5337
|
|
4904
|
-
#if defined(
|
4905
|
-
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for
|
5338
|
+
#if defined(GGML_USE_KOMPUTE)
|
5339
|
+
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
|
4906
5340
|
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
|
4907
5341
|
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
|
4908
5342
|
if (hparams.f_max_alibi_bias > 0.0f) {
|
@@ -4924,6 +5358,8 @@ static struct ggml_tensor * llm_build_kqv(
|
|
4924
5358
|
cb(kq, "kq_soft_max_ext", il);
|
4925
5359
|
}
|
4926
5360
|
|
5361
|
+
GGML_ASSERT(kv.size == n_ctx);
|
5362
|
+
|
4927
5363
|
// split cached v into n_head heads
|
4928
5364
|
struct ggml_tensor * v =
|
4929
5365
|
ggml_view_3d(ctx, kv.v_l[il],
|
@@ -4986,6 +5422,7 @@ static struct ggml_tensor * llm_build_kv(
|
|
4986
5422
|
llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
|
4987
5423
|
|
4988
5424
|
struct ggml_tensor * cur;
|
5425
|
+
|
4989
5426
|
cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
|
4990
5427
|
q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
|
4991
5428
|
cb(cur, "kqv_out", il);
|
@@ -4995,7 +5432,7 @@ static struct ggml_tensor * llm_build_kv(
 
 struct llm_build_context {
     const llama_model    & model;
-    const llama_context  & lctx;
+          llama_context  & lctx;
     const llama_hparams  & hparams;
     const llama_cparams  & cparams;
     const llama_batch    & batch;
@@ -5070,10 +5507,10 @@ struct llm_build_context {
         norm_eps         (hparams.f_norm_eps),
         norm_rms_eps     (hparams.f_norm_rms_eps),
         n_tokens         (batch.n_tokens),
-        n_kv             (worst_case ?
-        kv_head          (worst_case ?
+        n_kv             (worst_case ? kv_self.size : kv_self.n),
+        kv_head          (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
         n_orig_ctx       (cparams.n_yarn_orig_ctx),
-        pooling_type     (cparams.
+        pooling_type     (cparams.pooling_type),
         rope_type        (hparams.rope_type),
         cb               (cb),
         buf_compute_meta (lctx.buf_compute_meta) {
@@ -5088,6 +5525,18 @@ struct llm_build_context {
         };
 
         ctx0 = ggml_init(params);
+
+        lctx.inp_tokens  = nullptr;
+        lctx.inp_embd    = nullptr;
+        lctx.inp_pos     = nullptr;
+        lctx.inp_KQ_mask = nullptr;
+        lctx.inp_KQ_pos  = nullptr;
+        lctx.inp_K_shift = nullptr;
+        lctx.inp_mean    = nullptr;
+        lctx.inp_cls     = nullptr;
+        lctx.inp_s_copy  = nullptr;
+        lctx.inp_s_mask  = nullptr;
+        lctx.inp_s_seq   = nullptr;
     }
 
     void free() {
@@ -5100,6 +5549,12 @@ struct llm_build_context {
     struct ggml_cgraph * build_k_shift() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
+        GGML_ASSERT(kv_self.size == n_ctx);
+
+        lctx.inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+        cb(lctx.inp_K_shift, "K_shift", -1);
+        ggml_set_input(lctx.inp_K_shift);
+
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * tmp =
                 // we rotate only the first n_rot dimensions
@@ -5118,6 +5573,29 @@ struct llm_build_context {
         return gf;
     }
 
+    struct ggml_cgraph * build_s_copy() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        GGML_ASSERT(kv_self.recurrent);
+
+        struct ggml_tensor * state_copy = build_inp_s_copy();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s(), kv_self.size);
+            struct ggml_tensor * ssm_states  = ggml_reshape_2d(ctx0, kv_self.v_l[il], hparams.n_embd_v_s(), kv_self.size);
+
+            conv_states = ggml_get_rows(ctx0, conv_states, state_copy);
+            ssm_states  = ggml_get_rows(ctx0,  ssm_states, state_copy);
+
+            // TODO: name the intermediate tensors with cb()
+
+            ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_states, kv_self.k_l[il]));
+            ggml_build_forward_expand(gf, ggml_cpy(ctx0,  ssm_states, kv_self.v_l[il]));
+        }
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
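
The build_s_copy() graph above relies on ggml_get_rows() to gather whole cache rows by index. A minimal standalone sketch of that gather, with illustrative names that are not part of the diff:

    // states:     [state_size, n_states] (F32) - one recurrent state per row
    // state_copy: [n_states]             (I32) - state_copy[i] = row to place at slot i
    // Duplicated indices fan one source state out to several slots, which is
    // how pending sequence copies are realized for recurrent models.
    struct ggml_tensor * gathered = ggml_get_rows(ctx, states, state_copy);
    ggml_build_forward_expand(gf, ggml_cpy(ctx, gathered, states));
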
@@ -5167,6 +5645,66 @@ struct llm_build_context {
         return gf;
     }
 
+    struct ggml_tensor * build_inp_pos() {
+        lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(lctx.inp_pos, "inp_pos", -1);
+        ggml_set_input(lctx.inp_pos);
+        return lctx.inp_pos;
+    }
+
+    struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
+        if (causal) {
+            lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv,     n_tokens);
+        } else {
+            lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
+        }
+        cb(lctx.inp_KQ_mask, "KQ_mask", -1);
+        ggml_set_input(lctx.inp_KQ_mask);
+        return lctx.inp_KQ_mask;
+    }
+
+    struct ggml_tensor * build_inp_KQ_pos() {
+        lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
+        cb(lctx.inp_KQ_pos, "KQ_pos", -1);
+        ggml_set_input(lctx.inp_KQ_pos);
+        return lctx.inp_KQ_pos;
+    }
+
+    struct ggml_tensor * build_inp_mean() {
+        lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
+        cb(lctx.inp_mean, "inp_mean", -1);
+        ggml_set_input(lctx.inp_mean);
+        return lctx.inp_mean;
+    }
+
+    struct ggml_tensor * build_inp_cls() {
+        lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(lctx.inp_cls, "inp_cls", -1);
+        ggml_set_input(lctx.inp_cls);
+        return lctx.inp_cls;
+    }
+
+    struct ggml_tensor * build_inp_s_copy() {
+        lctx.inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, kv_self.size);
+        cb(lctx.inp_s_copy, "inp_s_copy", -1);
+        ggml_set_input(lctx.inp_s_copy);
+        return lctx.inp_s_copy;
+    }
+
+    struct ggml_tensor * build_inp_s_mask() {
+        lctx.inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv);
+        cb(lctx.inp_s_mask, "inp_s_mask", -1);
+        ggml_set_input(lctx.inp_s_mask);
+        return lctx.inp_s_mask;
+    }
+
+    struct ggml_tensor * build_inp_s_seq() {
+        lctx.inp_s_seq = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens);
+        cb(lctx.inp_s_seq, "inp_s_seq", -1);
+        ggml_set_input(lctx.inp_s_seq);
+        return lctx.inp_s_seq;
+    }
+
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
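
Together with the nullptr resets in init(), these build_inp_*() helpers let each graph declare exactly the input tensors it uses; the upload code can then guard on the pointer. A sketch of the consumer side, mirroring llama_set_inputs() later in this diff:

    // only upload positions when this graph actually created an inp_pos tensor
    if (batch.pos && lctx.inp_pos) {
        ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0,
                batch.n_tokens*ggml_element_size(lctx.inp_pos));
    }
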
@@ -5177,16 +5715,13 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
-        cb(inpL, "inp_embd", -1);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos =
-        cb(inp_pos, "inp_pos", -1);
+        struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask =
-        cb(KQ_mask, "KQ_mask", -1);
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -5238,7 +5773,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
             }
 
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -5356,20 +5890,16 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
-        cb(inpL, "inp_embd", -1);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos =
-        cb(inp_pos, "inp_pos", -1);
+        struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask =
-        cb(KQ_mask, "KQ_mask", -1);
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos =
-        cb(KQ_pos, "KQ_pos", -1);
+        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
 
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -5417,7 +5947,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
                         Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
             }
 
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -5473,16 +6002,13 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
-        cb(inpL, "inp_embd", -1);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos =
-        cb(inp_pos, "inp_pos", -1);
+        struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask =
-        cb(KQ_mask, "KQ_mask", -1);
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * attn_norm;
@@ -5536,7 +6062,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
             }
 
             struct ggml_tensor * ffn_inp = cur;
@@ -5587,21 +6112,17 @@ struct llm_build_context {
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
-        struct ggml_tensor * pos;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
-        cb(inpL, "inp_embd", -1);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos =
-        cb(inp_pos, "inp_pos", -1);
+        struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask =
-        cb(KQ_mask, "KQ_mask", -1);
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
-        pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
+        struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
         cb(pos, "pos_embd", -1);
 
         inpL = ggml_add(ctx0, inpL, pos);
@@ -5635,7 +6156,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
             }
 
             // add the input
@@ -5687,16 +6207,13 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
-        cb(inpL, "inp_embd", -1);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos =
-        cb(inp_pos, "inp_pos", -1);
+        struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask =
-        cb(KQ_mask, "KQ_mask", -1);
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * residual = inpL;
@@ -5836,7 +6353,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
             }
 
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
@@ -5890,16 +6406,13 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
-        cb(inpL, "inp_embd", -1);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask =
-        cb(KQ_mask, "KQ_mask", -1);
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos =
-        cb(KQ_pos, "KQ_pos", -1);
+        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
 
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -5929,7 +6442,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
                         Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
             }
 
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -5979,19 +6491,18 @@ struct llm_build_context {
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-
-
-        struct ggml_tensor *
-        struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
-        struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
+        struct ggml_tensor * inp_pos  = build_inp_pos();
+        struct ggml_tensor * inp_mean = build_inp_mean();
+        struct ggml_tensor * inp_cls  = build_inp_cls();
 
         // construct input embeddings (token, type, position)
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // token types are hardcoded to zero ("Sentence A")
         struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
@@ -6006,39 +6517,37 @@ struct llm_build_context {
         cb(inpL, "inp_norm", -1);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask =
-        cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false);
 
         // iterate layers
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * cur = inpL;
 
+            struct ggml_tensor * Qcur;
+            struct ggml_tensor * Kcur;
+            struct ggml_tensor * Vcur;
+
             // self-attention
             if (model.arch == LLM_ARCH_BERT) {
-
+                Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
                 cb(Qcur, "Qcur", il);
 
-
+                Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
                 cb(Kcur, "Kcur", il);
 
-
+                Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
                 cb(Vcur, "Vcur", il);
 
-
-
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
             } else {
                 // compute Q and K and RoPE them
                 cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);
 
-
-
-
+                Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
@@ -6057,12 +6566,40 @@ struct llm_build_context {
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
+            }
 
-
-
-
-
+            struct ggml_tensor * q =                 ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+
+            struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+            cb(kq, "kq", il);
+
+            kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
+            cb(kq, "kq_soft_max_ext", il);
+
+            struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
+            cb(v, "v", il);
+
+            struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
+            cb(kqv, "kqv", il);
+
+            struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+            cb(kqv_merged, "kqv_merged", il);
+
+            cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+            cb(cur, "kqv_merged_cont", il);
+
+            ggml_build_forward_expand(gf, cur);
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+            if (model.layers[il].bo) {
+                cb(cur, "kqv_wo", il);
+            }
+
+            if (model.layers[il].bo) {
+                cur = ggml_add(ctx0, cur, model.layers[il].bo);
             }
+            cb(cur, "kqv_out", il);
 
             // re-add the layer input
             cur = ggml_add(ctx0, cur, inpL);
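
The non-cached path added above computes attention explicitly instead of going through llm_build_kv(). Per head, and glossing over the ggml layout details, it evaluates roughly

    out = W_o * ( V * softmax( K^T Q / sqrt(d_head) + M ) ) + b_o

where M is the KQ_mask and ggml_soft_max_ext() folds both the 1/sqrt(d_head) scale and the optional ALiBi bias (hparams.f_max_alibi_bias) into the softmax.
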
@@ -6103,16 +6640,29 @@ struct llm_build_context {
 
         // final output
         cur = inpL;
+        cb(cur, "result_embd", -1);
 
         // pooling layer
-
-
-
-
-
-
+        switch (pooling_type) {
+            case LLAMA_POOLING_TYPE_NONE:
+                {
+                    // nop
+                } break;
+            case LLAMA_POOLING_TYPE_MEAN:
+                {
+                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
+                    cb(cur, "result_embd_pooled", -1);
+                } break;
+            case LLAMA_POOLING_TYPE_CLS:
+                {
+                    cur = ggml_get_rows(ctx0, cur, inp_cls);
+                    cb(cur, "result_embd_pooled", -1);
+                } break;
+            case LLAMA_POOLING_TYPE_UNSPECIFIED:
+                {
+                    GGML_ASSERT(false && "Invalid pooling type");
+                } break;
         }
-        cb(cur, "result_embd", -1);
 
         ggml_build_forward_expand(gf, cur);
 
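
The MEAN branch implements pooling as a single matmul: assuming inp_mean is filled so that column s holds 1/|seq s| at the token positions belonging to sequence s and 0 elsewhere (see llama_set_inputs() later in this diff), the transposed embeddings times inp_mean give, per sequence,

    pooled[:, s] = sum_i E[:, i] * inp_mean[i, s]  =  mean of sequence s's token embeddings

while the CLS branch simply gathers one embedding column per sequence with ggml_get_rows().
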
@@ -6129,16 +6679,13 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
-        cb(inpL, "inp_embd", -1);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask =
-        cb(KQ_mask, "KQ_mask", -1);
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos =
-        cb(KQ_pos, "KQ_pos", -1);
+        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
 
         inpL = llm_build_norm(ctx0, inpL, hparams,
                 model.tok_norm,
@@ -6174,7 +6721,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
             }
 
             // Add the input
@@ -6226,16 +6772,13 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
-        cb(inpL, "inp_embd", -1);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask =
-        cb(KQ_mask, "KQ_mask", -1);
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos =
-        cb(KQ_pos, "KQ_pos", -1);
+        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
 
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * attn_norm;
@@ -6276,7 +6819,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
             }
 
             // Add the input
@@ -6331,16 +6873,13 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
-        cb(inpL, "inp_embd", -1);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos =
-        cb(inp_pos, "inp_pos", -1);
+        struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask =
-        cb(KQ_mask, "KQ_mask", -1);
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -6393,7 +6932,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
             }
 
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -6449,16 +6987,13 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
-        cb(inpL, "inp_embd", -1);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos =
-        cb(inp_pos, "inp_pos", -1);
+        struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask =
-        cb(KQ_mask, "KQ_mask", -1);
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -6503,7 +7038,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
             }
 
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -6558,16 +7092,13 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
-        cb(inpL, "inp_embd", -1);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos =
-        cb(inp_pos, "inp_pos", -1);
+        struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask =
-        cb(KQ_mask, "KQ_mask", -1);
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -6619,7 +7150,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
             }
 
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -6674,16 +7204,13 @@ struct llm_build_context {
         struct ggml_tensor * ffn_output;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
-        cb(inpL, "inp_embd", -1);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos =
-        cb(inp_pos, "inp_pos", -1);
+        struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask =
-        cb(KQ_mask, "KQ_mask", -1);
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         for (int il = 0; il < n_layer; ++il) {
             attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
@@ -6741,7 +7268,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
-                cb(cur, "kqv_out", il);
             }
 
             // FF
@@ -6791,16 +7317,13 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
-        cb(inpL, "inp_embd", -1);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos =
-        cb(inp_pos, "inp_pos", -1);
+        struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask =
-        cb(KQ_mask, "KQ_mask", -1);
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         for (int il = 0; il < n_layer; ++il) {
 
@@ -6839,7 +7362,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
             }
             struct ggml_tensor * sa_out = cur;
 
@@ -6893,16 +7415,13 @@ struct llm_build_context {
         struct ggml_tensor * pos;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
-        cb(inpL, "inp_embd", -1);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos =
-        cb(inp_pos, "inp_pos", -1);
+        struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask =
-        cb(KQ_mask, "KQ_mask", -1);
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
         cb(pos, "pos_embd", -1);
@@ -6938,7 +7457,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
             }
 
             // add the input
@@ -6991,16 +7509,13 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
-        cb(inpL, "inp_embd", -1);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos =
-        cb(inp_pos, "inp_pos", -1);
+        struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask =
-        cb(KQ_mask, "KQ_mask", -1);
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         for (int il = 0; il < n_layer; ++il) {
             cur = llm_build_norm(ctx0, inpL, hparams,
@@ -7042,7 +7557,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
             }
 
             // add the input
@@ -7094,16 +7608,13 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
-        cb(inpL, "inp_embd", -1);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos =
-        cb(inp_pos, "inp_pos", -1);
+        struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask =
-        cb(KQ_mask, "KQ_mask", -1);
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -7155,7 +7666,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
             }
 
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -7208,16 +7718,13 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
-        cb(inpL, "inp_embd", -1);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos =
-        cb(inp_pos, "inp_pos", -1);
+        struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask =
-        cb(KQ_mask, "KQ_mask", -1);
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -7269,7 +7776,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
             }
 
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -7331,20 +7837,17 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
-        cb(inpL, "inp_embd", -1);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // scale the input embeddings
         inpL = ggml_scale(ctx0, inpL, scale_embd);
         cb(inpL, "inp_scaled", -1);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos =
-        cb(inp_pos, "inp_pos", -1);
+        struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask =
-        cb(KQ_mask, "KQ_mask", -1);
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -7396,7 +7899,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
             }
 
             // scale_res - scale the hidden states for residual connection
@@ -7463,22 +7965,18 @@ struct llm_build_context {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd,
-        cb(inpL, "inp_embd", -1);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
         cb(inpL, "inp_scaled", -1);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos =
-        cb(inp_pos, "inp_pos", -1);
+        struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask =
-        cb(KQ_mask, "KQ_mask", -1);
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         for (int il = 0; il < n_layer; ++il) {
-
             // norm
             cur = llm_build_norm(ctx0, inpL, hparams,
                     model.layers[il].attn_norm, NULL,
@@ -7515,7 +8013,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
-                cb(cur, "kqv_out", il);
             }
 
             struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
@@ -7559,6 +8056,255 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_starcoder2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
+                        NULL,                      NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+            cb(cur, "ffn_out", il);
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_mamba() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t d_model = n_embd;
+        const int64_t d_conv  = hparams.ssm_d_conv;
+        const int64_t d_inner = hparams.ssm_d_inner;
+        GGML_ASSERT(2 * d_model == d_inner);
+        const int64_t d_state = hparams.ssm_d_state;
+        const int64_t dt_rank = hparams.ssm_dt_rank;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        // {n_embd, n_tokens}
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        struct ggml_tensor * state_mask = build_inp_s_mask();
+        struct ggml_tensor * state_seq  = build_inp_s_seq();
+
+        for (int il = 0; il < n_layer; ++il) {
+            // (ab)using the KV cache to store the states
+            struct ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s(), kv_self.size);
+            struct ggml_tensor * ssm_states  = ggml_reshape_2d(ctx0, kv_self.v_l[il], hparams.n_embd_v_s(), kv_self.size);
+
+            // clear states of sequences which are starting at the beginning of this batch
+            {
+                conv_states = ggml_mul(ctx0,
+                    ggml_view_2d(ctx0, conv_states, conv_states->ne[0], n_kv, conv_states->nb[1], kv_head*conv_states->nb[1]),
+                    state_mask);
+                ssm_states  = ggml_mul(ctx0,
+                    ggml_view_2d(ctx0, ssm_states, ssm_states->ne[0], n_kv, ssm_states->nb[1], kv_head*ssm_states->nb[1]),
+                    state_mask);
+            }
+
+            conv_states = ggml_reshape_3d(ctx0, conv_states, d_conv - 1, d_inner, n_kv);
+            ssm_states  = ggml_reshape_3d(ctx0,  ssm_states,    d_state, d_inner, n_kv);
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // {n_embd, 2*d_inner} * {n_embd, n_tokens} => {2*d_inner, n_tokens}
+            struct ggml_tensor * xz = ggml_mul_mat(ctx0, model.layers[il].ssm_in, cur);
+            // split the above in two
+            // => {d_inner, n_tokens}
+            struct ggml_tensor * x = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], 0);
+            struct ggml_tensor * z = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], ggml_element_size(xz)*d_inner);
+
+            // conv
+            {
+                // Custom operator which is needed only to ease simultaneous sequence processing.
+                // For a single sequence, the equivalent is to concatenate the columns of conv_states and x,
+                // then make a self-overlapping view of that over d_conv columns at each stride in the 3rd dimension,
+                // then element-wise multiply that with the conv1d weight,
+                // then sum the elements of each row,
+                // (the last two steps are a dot product over rows (also doable with mul_mat))
+                // then permute away the ne[0] dimension,
+                // and then you're left with the resulting x tensor.
+                // The new conv_states is the last (d_conv - 1) columns
+                // of the last 3rd dimensional "layer" of the self-overlapping view.
+                // For simultaneous sequences, it's more complicated.
+                struct ggml_tensor * x_conv = ggml_ssm_conv(ctx0, conv_states, x, model.layers[il].ssm_conv1d, state_seq);
+
+                // store last (d_conv - 1) columns of the conv_state part of x_conv back into the KV cache
+                ggml_build_forward_expand(gf,
+                    ggml_cpy(ctx0,
+                        ggml_view_2d(ctx0, x_conv, d_conv - 1, d_inner*n_kv, d_conv*ggml_element_size(x_conv), (1+d_inner*n_tokens)*ggml_element_size(x_conv)),
+                        ggml_view_1d(ctx0, kv_self.k_l[il], (d_conv - 1)*(d_inner)*(n_kv), kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(x_conv))));
+
+                // extract x from x_conv
+                x = ggml_view_2d(ctx0, x_conv, d_inner, n_tokens, d_inner*ggml_element_size(x_conv), 0);
+
+                // bias
+                x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b);
+
+                x = ggml_silu(ctx0, x);
+            }
+
+            // ssm
+            {
+                // {d_inner, dt_rank + 2*d_state} * {d_inner, n_tokens} => {dt_rank + 2*d_state, n_tokens}
+                struct ggml_tensor * x_db = ggml_mul_mat(ctx0, model.layers[il].ssm_x, x);
+                // split
+                struct ggml_tensor * dt = ggml_view_2d(ctx0, x_db, dt_rank, n_tokens, x_db->nb[1], 0);
+                struct ggml_tensor * B  = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*dt_rank);
+                struct ggml_tensor * C  = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state));
+
+                // {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens}
+                dt = ggml_mul_mat(ctx0, model.layers[il].ssm_dt, dt);
+                dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
+
+                // Custom operator to optimize the parallel associative scan
+                // as described in the Annex D of the Mamba paper.
+                // => {d_inner, n_tokens} and {d_state, d_inner, n_kv} combined,
+                // because only a single tensor can be returned.
+                struct ggml_tensor * y_ssm_states = ggml_ssm_scan(ctx0, ssm_states, x, dt, model.layers[il].ssm_a, B, C, state_seq);
+
+                // store last states (the second part of y_ssm_states)
+                ggml_build_forward_expand(gf,
+                    ggml_cpy(ctx0,
+                        ggml_view_1d(ctx0, y_ssm_states, d_state*d_inner*n_kv, d_inner*n_tokens*ggml_element_size(y_ssm_states)),
+                        ggml_view_1d(ctx0, kv_self.v_l[il], d_state*d_inner*n_kv, kv_head*d_state*d_inner*ggml_element_size(ssm_states))));
+
+                struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0);
+
+                // {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens}
+                y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
+                y = ggml_mul(ctx0, y, ggml_silu(ctx0, z));
+
+                // {d_inner, n_embd} * {d_inner, n_tokens} => {n_embd, n_tokens}
+                cur = ggml_mul_mat(ctx0, model.layers[il].ssm_out, y);
+            }
+
+            // residual
+            cur = ggml_add(ctx0, cur, inpL);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        // final rmsnorm
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
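
For reference, the ssm block above follows the discretized selective-SSM update from the Mamba paper; per channel, and with dt passed through a softplus inside ggml_ssm_scan() (a sketch of the math, not additional code):

    dt' = softplus(dt)
    h_t = exp(dt' * A) o h_{t-1} + (dt' * B_t) * x_t
    y_t = C_t . h_t + D o x_t

where o is element-wise product; the final output is ssm_out * (y o SiLU(z)), the usual gated form.
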
@@ -7595,6 +8341,23 @@ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
     return result;
 }
 
+static struct ggml_cgraph * llama_build_graph_s_copy(llama_context & lctx) {
+    llama_batch dummy;
+    dummy.n_tokens = 0;
+
+    llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
+
+    struct llm_build_context llm(lctx, dummy, cb, false);
+
+    llm.init();
+
+    struct ggml_cgraph * result = llm.build_s_copy();
+
+    llm.free();
+
+    return result;
+}
+
 static struct ggml_cgraph * llama_build_graph(
          llama_context & lctx,
      const llama_batch & batch,
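
llama_build_graph_s_copy() mirrors llama_build_graph_k_shift(): the batch is a dummy with n_tokens == 0 because the graph only touches cache tensors. A rough sketch of how such a maintenance graph would be driven (the exact call site lives elsewhere in this file; this is illustrative, not quoted from the diff):

    ggml_backend_sched_reset(lctx.sched);
    struct ggml_cgraph * gf = llama_build_graph_s_copy(lctx);
    ggml_backend_sched_alloc_graph(lctx.sched, gf);
    llama_set_s_copy(lctx); // upload kv_self.cells[i].src into lctx.inp_s_copy
    llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
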
@@ -7612,7 +8375,18 @@ static struct ggml_cgraph * llama_build_graph(
         if (!lctx.cparams.offload_kqv) {
             if (strcmp(name, "kqv_merged_cont") == 0) {
                 // all nodes between the KV store and the attention output are run on the CPU
-
+                ggml_backend_sched_set_tensor_backend(lctx.sched, cur, lctx.backend_cpu);
+            }
+        }
+
+        // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
+        // to fix this, we assign the norm layer manually to the backend of its layer
+        if (il != -1 && strcmp(name, "norm") == 0) {
+            for (auto * backend : lctx.backends) {
+                if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
+                    ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
+                    break;
+                }
             }
         }
     };
@@ -7705,6 +8479,14 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_gemma();
             } break;
+        case LLM_ARCH_STARCODER2:
+            {
+                result = llm.build_starcoder2();
+            } break;
+        case LLM_ARCH_MAMBA:
+            {
+                result = llm.build_mamba();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -7715,19 +8497,29 @@ static struct ggml_cgraph * llama_build_graph(
 }
 
 static void llama_set_k_shift(llama_context & lctx) {
-    const
-
-    const int64_t n_ctx = cparams.n_ctx;
+    const int64_t kv_size = lctx.kv_self.size;
 
     assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
 
     int32_t * data = (int32_t *) lctx.inp_K_shift->data;
 
-    for (int i = 0; i <
+    for (int i = 0; i < kv_size; ++i) {
         data[i] = lctx.kv_self.cells[i].delta;
     }
 }
 
+static void llama_set_s_copy(llama_context & lctx) {
+    const int64_t kv_size = lctx.kv_self.size;
+
+    assert(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer));
+
+    int32_t * data = (int32_t *) lctx.inp_s_copy->data;
+
+    for (int i = 0; i < kv_size; ++i) {
+        data[i] = lctx.kv_self.cells[i].src;
+    }
+}
+
 static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
     //
     // set input data
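
llama_set_k_shift() and llama_set_s_copy() each upload one value per cache cell: delta is the accumulated position shift a cell's cached K entries still need (the build_k_shift() graph then re-applies RoPE by that amount), while src names the slot a recurrent state should be copied from. As a worked example, after shifting a sequence back by m positions through the kv-cache sequence API, every affected cell carries delta = -m until the shift graph runs and rotates its cached keys to match the new positions.
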
@@ -7750,34 +8542,74 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
     }
 
-    if (batch.pos) {
+    if (batch.pos && lctx.inp_pos) {
         const int64_t n_tokens = batch.n_tokens;
 
         ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
     }
 
-
-
-
+    GGML_ASSERT(
+        (hparams.causal_attn || !cparams.causal_attn) &&
+        "non-causal attention with generative models is not supported"
+    );
 
-
+    if (lctx.inp_KQ_mask) {
+        // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache.
+        if (cparams.causal_attn) {
+            const int64_t n_kv     = kv_self.n;
+            const int64_t n_tokens = batch.n_tokens;
 
-
+            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
 
-
-        for (int j = 0; j < n_tokens; ++j) {
-            const llama_pos pos = batch.pos[j];
-            const llama_seq_id seq_id = batch.seq_id[j][0];
+            float * data = (float *) lctx.inp_KQ_mask->data;
 
-
-
-
-
-
-
-
+            // For causal attention, use only the previous KV cells
+            // of the correct sequence for each token of the batch.
+            // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
+            for (int h = 0; h < 1; ++h) {
+                for (int j = 0; j < n_tokens; ++j) {
+                    const llama_pos    pos    = batch.pos[j];
+                    const llama_seq_id seq_id = batch.seq_id[j][0];
+
+                    for (int i = 0; i < n_kv; ++i) {
+                        float f;
+                        if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
+                            f = -INFINITY;
+                        } else {
+                            f = 0.0f;
+                        }
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
+                    }
+                }
+            }
+        } else {
+            // when using kv cache, the mask needs to match the kv cache size
+            const int64_t n_tokens = batch.n_tokens;
+            const int64_t n_stride = hparams.causal_attn ? kv_self.n : n_tokens;
+
+            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
+
+            float * data = (float *) lctx.inp_KQ_mask->data;
+
+            for (int h = 0; h < 1; ++h) {
+                for (int j = 0; j < n_tokens; ++j) {
+                    const llama_seq_id seq_id = batch.seq_id[j][0];
+
+                    for (int i = 0; i < n_tokens; ++i) {
+                        float f = -INFINITY;
+                        for (int s = 0; s < batch.n_seq_id[i]; ++s) {
+                            if (batch.seq_id[i][s] == seq_id) {
+                                f = 0.0f;
+                                break;
+                            }
+                        }
+
+                        data[h*(n_tokens*n_tokens) + j*n_stride + i] = f;
+                    }
+
+                    for (int i = n_tokens; i < n_stride; ++i) {
+                        data[h*(n_tokens*n_tokens) + j*n_stride + i] = -INFINITY;
                     }
-                    data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
                 }
             }
         }
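
As a worked example of the causal branch above: a batch of three tokens at positions 5..7 of one sequence, with n_kv = 8 cells holding that sequence's positions 0..7, gets the mask rows (0 = attend, -INFINITY = masked)

    token@5:  0 0 0 0 0 0 -inf -inf
    token@6:  0 0 0 0 0 0  0   -inf
    token@7:  0 0 0 0 0 0  0    0

and any cell belonging to a different sequence is likewise set to -INFINITY.
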
@@ -7786,7 +8618,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
7786
8618
|
if (hparams.need_kq_pos) {
|
7787
8619
|
const int64_t n_kv = kv_self.n;
|
7788
8620
|
|
7789
|
-
|
8621
|
+
GGML_ASSERT(lctx.inp_KQ_pos);
|
8622
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
|
7790
8623
|
|
7791
8624
|
float * data = (float *) lctx.inp_KQ_pos->data;
|
7792
8625
|
|
@@ -7795,17 +8628,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
7795
8628
|
}
|
7796
8629
|
}
|
7797
8630
|
|
7798
|
-
if (cparams.
|
8631
|
+
if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
|
7799
8632
|
const int64_t n_tokens = batch.n_tokens;
|
7800
8633
|
|
8634
|
+
GGML_ASSERT(lctx.inp_mean);
|
7801
8635
|
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
|
7802
|
-
float * data = (float *) lctx.inp_mean->data;
|
7803
8636
|
|
8637
|
+
float * data = (float *) lctx.inp_mean->data;
|
7804
8638
|
memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
|
7805
8639
|
|
7806
8640
|
std::vector<uint64_t> sum(n_tokens, 0);
|
7807
8641
|
for (int i = 0; i < n_tokens; ++i) {
|
7808
8642
|
const llama_seq_id seq_id = batch.seq_id[i][0];
|
8643
|
+
|
8644
|
+
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
|
8645
|
+
|
7809
8646
|
sum[seq_id] += 1;
|
7810
8647
|
}
|
7811
8648
|
|
@@ -7823,20 +8660,73 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    if (cparams.
+    if (cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
         const int64_t n_tokens = batch.n_tokens;
 
+        GGML_ASSERT(lctx.inp_cls);
         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+
         uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
 
         for (int i = 0; i < n_tokens; ++i) {
             const llama_seq_id seq_id = batch.seq_id[i][0];
-            const llama_pos
+            const llama_pos    pos    = batch.pos[i];
+
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
+
             if (pos == 0) {
                 data[seq_id] = i;
             }
         }
     }
+
+    if (kv_self.recurrent) {
+        const int64_t n_kv = kv_self.n;
+
+        if (lctx.inp_s_mask) {
+            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer));
+            float * data = (float *) lctx.inp_s_mask->data;
+
+            // states which are not affected by the current batch are left untouched
+            for (int i = 0; i < n_kv; ++i) {
+                llama_seq_id    seq_id       = i + lctx.kv_self.head;
+                llama_kv_cell & kv_cell      = lctx.kv_self.cells[seq_id];
+                bool            has_self_seq = kv_cell.has_seq_id(seq_id);
+
+                data[i] = (float) has_self_seq;
+
+                // ensure current sequences will be kept
+                if (!has_self_seq && kv_cell.pos >= 0) {
+                    kv_cell.seq_id.insert(seq_id);
+                }
+            }
+        }
+        // For Mamba (and other recurrent architectures),
+        // update the correct state(s)/sequence(s) for each token of the batch.
+        // Like with the KQ_mask, if a token in the batch has multiple sequences,
+        // they are assumed to be equivalent (not here, but in ggml_ssm_scan and ggml_ssm_conv).
+        if (lctx.inp_s_seq) {
+            const int64_t n_tokens = batch.n_tokens;
+
+            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_seq->buffer));
+            int32_t * data = (int32_t *) lctx.inp_s_seq->data;
+
+            for (int j = 0; j < n_tokens; ++j) {
+                const int32_t n_seq = batch.n_seq_id[j];
+                GGML_ASSERT(0 < n_seq); // a token should be part of at least 1 sequence
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (i < n_seq) {
+                        // for this type of model, the head is the minimum seq_id of the batch
+                        data[j*n_kv + i] = batch.seq_id[j][i] - kv_self.head;
+                    } else {
+                        data[j*n_kv + i] = -1;
+                    }
+                }
+            }
+        }
+    }
 }
 
 static void llama_graph_compute(
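The MEAN branch above only counts tokens per sequence; the remainder of the function (elided in this hunk) is assumed to write 1/count into the matrix. A minimal self-contained sketch of the resulting layout, with a hypothetical build_mean_matrix helper (not part of llama.cpp): row s of the n_tokens x n_tokens matrix holds 1/count(s) at every batch position belonging to sequence s, so multiplying it with the token embeddings yields one averaged embedding per sequence.

// hypothetical helper, illustrative only
#include <cstdint>
#include <vector>

std::vector<float> build_mean_matrix(const std::vector<int32_t> & seq_ids, int n_tokens) {
    std::vector<float>    mean(n_tokens * n_tokens, 0.0f);
    std::vector<uint64_t> sum(n_tokens, 0);
    for (int i = 0; i < n_tokens; ++i) {
        sum[seq_ids[i]] += 1; // count tokens per sequence, as in the hunk above
    }
    for (int i = 0; i < n_tokens; ++i) {
        const int32_t s = seq_ids[i];
        mean[s*n_tokens + i] = 1.0f / float(sum[s]); // row s averages its own tokens
    }
    return mean;
}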
@@ -7856,9 +8746,10 @@ static void llama_graph_compute(
 
     if (lctx.backend_cpu != nullptr) {
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
+        ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
 
-
+    ggml_backend_sched_graph_compute_async(lctx.sched, gf);
 
     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
 
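A sketch of what an abort callback wired up as above can look like. It assumes the ggml convention that the callback returns true to stop graph computation early; the deadline logic and names here are illustrative, not from llama.cpp.

#include <chrono>

struct deadline {
    std::chrono::steady_clock::time_point t_end;
};

// matches the ggml_abort_callback shape: return true to abort computation
static bool abort_if_past_deadline(void * data) {
    const auto * d = static_cast<const deadline *>(data);
    return std::chrono::steady_clock::now() > d->t_end;
}

// usage sketch: params.abort_callback      = abort_if_past_deadline;
//               params.abort_callback_data = &my_deadline;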
@@ -7878,10 +8769,11 @@ static void llama_graph_compute(
 //
 static int llama_decode_internal(
          llama_context & lctx,
-           llama_batch
-
+           llama_batch   batch_all) { // TODO: rename back to batch
+
+    const uint32_t n_tokens_all = batch_all.n_tokens;
 
-    if (
+    if (n_tokens_all == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
         return -1;
     }
@@ -7890,14 +8782,16 @@ static int llama_decode_internal(
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
 
-
+    GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
 
-    GGML_ASSERT(
-    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
+    GGML_ASSERT(n_tokens_all <= cparams.n_batch);
 
-
+    GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
 
-
+    if (lctx.t_compute_start_us == 0) {
+        lctx.t_compute_start_us = ggml_time_us();
+    }
+    lctx.n_queued_tokens += n_tokens_all;
 
 #ifdef GGML_USE_MPI
     // TODO: needs fix after #3228
@@ -7905,213 +8799,274 @@ static int llama_decode_internal(
     //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
 #endif
 
-    GGML_ASSERT(n_threads > 0);
-
     auto & kv_self = lctx.kv_self;
 
     const int64_t n_embd  = hparams.n_embd;
     const int64_t n_vocab = hparams.n_vocab;
 
-    // helpers for smoother batch API transition
-    // after deprecating the llama_eval calls, these will be removed
-    std::vector<llama_pos> pos;
 
-
-    std::vector<llama_seq_id *> seq_id_arr;
-    std::vector<std::vector<llama_seq_id>> seq_id;
+    auto * logits_out = lctx.logits;
 
-
-
-
-
-    }
+#ifndef NDEBUG
+    auto & logits_valid = lctx.logits_valid;
+    logits_valid.clear();
+    logits_valid.resize(n_tokens_all);
 
-
-
+    memset(logits_out, 0, lctx.logits_size*sizeof(float));
+#endif
 
-
-    n_seq_id.resize(n_tokens);
-    seq_id.resize(n_tokens);
-    seq_id_arr.resize(n_tokens);
-    for (uint32_t i = 0; i < n_tokens; i++) {
-        n_seq_id[i] = 1;
-        seq_id[i].resize(1);
-        seq_id[i][0] = batch.all_seq_id;
-        seq_id_arr[i] = seq_id[i].data();
-    }
+    const auto n_ubatch = cparams.n_ubatch;
 
-
-
-
+    std::vector<llama_pos> pos;
+    std::vector<int32_t>                   n_seq_id;
+    std::vector<llama_seq_id *>            seq_id_arr;
+    std::vector<std::vector<llama_seq_id>> seq_id;
 
-
+    for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
+        const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
+        llama_batch u_batch = {
+            /* .n_tokens   = */ (int32_t) n_tokens,
+            /* .token      = */ batch_all.token     ? batch_all.token    + cur_token        : nullptr,
+            /* .embd       = */ batch_all.embd      ? batch_all.embd     + cur_token*n_embd : nullptr,
+            /* .pos        = */ batch_all.pos       ? batch_all.pos      + cur_token        : nullptr,
+            /* .n_seq_id   = */ batch_all.n_seq_id  ? batch_all.n_seq_id + cur_token        : nullptr,
+            /* .seq_id     = */ batch_all.seq_id    ? batch_all.seq_id   + cur_token        : nullptr,
+            /* .logits     = */ batch_all.logits    ? batch_all.logits   + cur_token        : nullptr,
+            /* .all_pos_0  = */ batch_all.all_pos_0 + (llama_pos) cur_token*batch_all.all_pos_1,
+            /* .all_pos_1  = */ batch_all.all_pos_1,
+            /* .all_seq_id = */ batch_all.all_seq_id,
+        };
 
-
-
-        if (kv_self.head > kv_self.used + 2*n_tokens) {
-            kv_self.head = 0;
-        }
+        int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+        GGML_ASSERT(n_threads > 0);
 
-
-
-
+        // helpers for smoother batch API transition
+        // after deprecating the llama_eval calls, these will be removed
+        if (u_batch.pos == nullptr) {
+            pos.resize(n_tokens);
+            for (uint32_t i = 0; i < n_tokens; i++) {
+                pos[i] = u_batch.all_pos_0 + i*u_batch.all_pos_1;
+            }
 
-
-
-        // if we start defragmenting the cache, the benefit from this will be more important
-        kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
-        //kv_self.n = llama_kv_cache_cell_max(kv_self);
+            u_batch.pos = pos.data();
+        }
 
-
+        if (u_batch.seq_id == nullptr) {
+            n_seq_id.resize(n_tokens);
+            seq_id.resize(n_tokens);
+            seq_id_arr.resize(n_tokens);
+            for (uint32_t i = 0; i < n_tokens; i++) {
+                n_seq_id[i] = 1;
+                seq_id[i].resize(1);
+                seq_id[i][0] = u_batch.all_seq_id;
+                seq_id_arr[i] = seq_id[i].data();
+            }
 
-
-
+            u_batch.n_seq_id = n_seq_id.data();
+            u_batch.seq_id = seq_id_arr.data();
+        }
 
-
+        // non-causal masks do not use the KV cache
+        if (hparams.causal_attn) {
+            llama_kv_cache_update(&lctx);
 
-
-
-
+            // if we have enough unused cells before the current head ->
+            //   better to start searching from the beginning of the cache, hoping to fill it
+            if (kv_self.head > kv_self.used + 2*n_tokens) {
+                kv_self.head = 0;
+            }
+
+            if (!llama_kv_cache_find_slot(kv_self, u_batch)) {
+                return 1;
+            }
 
-
-
-
-
-
+            if (!kv_self.recurrent) {
+                // a heuristic, to avoid attending the full cache if it is not yet utilized
+                // after enough generations, the benefit from this heuristic disappears
+                // if we start defragmenting the cache, the benefit from this will be more important
+                kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
+                //kv_self.n = llama_kv_cache_cell_max(kv_self);
+            }
         }
-        } else if (strcmp(res->name, "result_embd") == 0) {
-            embeddings = res;
-            res = nullptr;
-        } else {
-            GGML_ASSERT(false);
-        }
 
-
+        //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
-
-
-        // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
-        // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
-        // with the BLAS calls. need a better solution
-        // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
-        // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
-        if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
-            n_threads = std::min(4, n_threads);
-        }
+        ggml_backend_sched_reset(lctx.sched);
+        ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
 
-
+        ggml_cgraph * gf = llama_build_graph(lctx, u_batch, false);
 
-
+        // the output is always the last tensor in the graph
+        struct ggml_tensor * res  = gf->nodes[gf->n_nodes - 1];
+        struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
 
-
-
-        kv_self.head += n_tokens;
+        if (!hparams.causal_attn) {
+            res = nullptr; // do not extract logits for embedding models such as BERT
 
-
-
-            kv_self.head = 0;
-        }
-    }
-
-    // decide if we need to defrag the kv cache
-    if (cparams.defrag_thold >= 0.0f) {
-        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens)/float(kv_self.n) : 0.0f;
+            // token or sequence embeddings
+            embd = gf->nodes[gf->n_nodes - 1];
 
-
-
-
+            GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
+        } else {
+            if (strcmp(res->name, "result_output") == 0) {
+                // the token embeddings could be the second to last tensor, or the third to last tensor
+                if (strcmp(embd->name, "result_norm") != 0) {
+                    embd = gf->nodes[gf->n_nodes - 3];
+                    GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
+                }
+            } else {
+                GGML_ASSERT(false && "missing result_output tensor");
+            }
+        }
+        // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
-
+        // for big prompts, if BLAS is enabled, it is better to use only one thread
+        // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
+        // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
+        //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
+        //       with the BLAS calls. need a better solution
+        // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
+        //                   being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
+        if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+            n_threads = std::min(4, n_threads);
         }
-    }
 
-
-    // print timing information per ggml operation (for debugging purposes)
-    // requires GGML_PERF to be defined
-    ggml_graph_print(gf);
-#endif
+        ggml_backend_sched_alloc_graph(lctx.sched, gf);
 
-
-    //if (n_past%100 == 0) {
-    //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
-    //}
+        llama_set_inputs(lctx, u_batch);
 
-
-    // TODO: do not compute and extract logits if only embeddings are needed
-    // need to update the graphs to skip "result_output"
-    if (res) {
-        auto & logits_out = lctx.logits;
+        llama_graph_compute(lctx, gf, n_threads);
 
-
-
-
-
+        // update the kv ring buffer
+        {
+            kv_self.head += n_tokens;
+
+            // Ensure kv cache head points to a valid index.
+            if (kv_self.head >= kv_self.size) {
+                kv_self.head = 0;
+            }
+        }
 
-
+#ifdef GGML_PERF
+        // print timing information per ggml operation (for debugging purposes)
+        // requires GGML_PERF to be defined
+        ggml_graph_print(gf);
#endif
 
-
-
-
-
-
-
-
-
+        // plot the computation graph in dot format (for debugging purposes)
+        //if (n_past%100 == 0) {
+        //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
+        //}
+
+        // extract logits
+        // TODO: do not compute and extract logits if only embeddings are needed
+        //       update the graphs to skip "result_output" if logits are not needed
+        if (res) {
+            ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
+            GGML_ASSERT(backend_res != nullptr);
+            if (u_batch.logits) {
+                int32_t i_first = -1;
+                for (uint32_t i = 0; i < n_tokens; i++) {
+                    if (u_batch.logits[i] && i_first == -1) {
+                        i_first = (int32_t) i;
+                    }
+                    if (u_batch.logits[i] == 0 || i == n_tokens - 1) {
+                        if (i_first != -1) {
+                            int i_last = u_batch.logits[i] == 0 ? i : i + 1;
+                            // extract logits for the range [i_first, i_last)
+                            // group the requests to minimize the number of calls to the backend
+                            ggml_backend_tensor_get_async(backend_res, res,
+                                logits_out + n_vocab*(cur_token + i_first),
+                                i_first*n_vocab*sizeof(float),
+                                (i_last - i_first)*n_vocab*sizeof(float));
+                            i_first = -1;
+                        }
+                    }
 #ifndef NDEBUG
-
+                    logits_valid[cur_token + i] = u_batch.logits[i] != 0;;
 #endif
-
-
-
-        ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
+                }
+            } else if (lctx.logits_all) {
+                ggml_backend_tensor_get_async(backend_res, res, logits_out + n_vocab*cur_token, 0, n_vocab*n_tokens*sizeof(float));
 #ifndef NDEBUG
-
+                std::fill(logits_valid.begin() + cur_token, logits_valid.begin() + cur_token + n_tokens, true);
 #endif
-
-
-
+            } else {
+                if (cur_token + n_tokens >= n_tokens_all) {
+                    ggml_backend_tensor_get_async(backend_res, res, logits_out, n_vocab*(n_tokens - 1)*sizeof(float), n_vocab*sizeof(float));
 #ifndef NDEBUG
-
+                    logits_valid[0] = true;
 #endif
+                }
+            }
         }
-        ggml_backend_synchronize(res_backend);
-    }
 
-
-
-
+        // extract embeddings
+        if (cparams.embeddings && embd) {
+            ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
+            GGML_ASSERT(backend_embd != nullptr);
 
-
-
+            switch (cparams.pooling_type) {
+                case LLAMA_POOLING_TYPE_NONE:
+                    {
+                        // extract token embeddings
+                        auto & embd_out = lctx.embd;
+
+                        if (u_batch.logits) {
+                            //embd_out.resize(n_embd * n_tokens);
+                            for (uint32_t i = 0; i < n_tokens; i++) {
+                                if (u_batch.logits[i] == 0) {
+                                    continue;
+                                }
+                                ggml_backend_tensor_get_async(backend_embd, embd, embd_out + n_embd*(i + cur_token), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
+                            }
+                        }
+                    } break;
+                case LLAMA_POOLING_TYPE_CLS:
+                case LLAMA_POOLING_TYPE_MEAN:
+                    {
+                        GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
 
-
-
-
-        ggml_backend_synchronize(embeddings_backend);
-    }
+                        // extract sequence embeddings
+                        auto & embd_seq_out = lctx.embd_seq;
+                        embd_seq_out.clear();
 
-
-
-
-
-
-
-
-
+                        for (uint32_t i = 0; i < n_tokens; i++) {
+                            const llama_seq_id seq_id = u_batch.seq_id[i][0];
+                            if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+                                continue;
+                            }
+                            embd_seq_out[seq_id].resize(n_embd);
+                            ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
+                        }
+                    } break;
+                case LLAMA_POOLING_TYPE_UNSPECIFIED:
+                    {
+                        GGML_ASSERT(false && "unknown pooling type");
+                    } break;
+            }
+        }
     }
 
-    //
-    //
-
-
-
+    // wait for the computation to finish (automatically done when obtaining the model output)
+    //llama_synchronize(&lctx);
+
+    // decide if we need to defrag the kv cache
+    if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
+        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
+
+        // queue defragmentation for next llama_kv_cache_update
+        if (fragmentation > cparams.defrag_thold) {
+            //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
+
+            llama_kv_cache_defrag(kv_self);
+        }
     }
 
     return 0;
 }
 
+
 // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
 static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     auto & kv_self = lctx.kv_self;
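The loop above splits batch_all into micro-batches of at most n_ubatch tokens, and each u_batch is just a window into the caller's arrays made of pointer offsets, with no copying. A minimal sketch of that slicing pattern, using simplified stand-in types rather than the real llama.h structs:

#include <algorithm>
#include <cstdint>

struct toy_batch {
    int32_t   n_tokens;
    int32_t * token; // nullptr when embeddings are used instead of tokens
};

void decode_in_ubatches(const toy_batch & all, uint32_t n_ubatch) {
    for (uint32_t cur = 0; cur < (uint32_t) all.n_tokens; cur += n_ubatch) {
        const uint32_t n = std::min(n_ubatch, (uint32_t) all.n_tokens - cur);
        toy_batch u = {
            (int32_t) n,
            all.token ? all.token + cur : nullptr, // offset into the parent batch, not a copy
        };
        // ... build the graph and compute for `u` ...
        (void) u;
    }
}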
@@ -8130,6 +9085,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     // number of cells moved
     uint32_t n_moves = 0;
 
+    // each move requires 6*n_layer tensors (see build_defrag)
+    //  - source view, destination view, copy operation
+    //  - x2 for keys and values
+    const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+
     // determine which KV cells to move where
     //
     //  cell i moves to ids[i]
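As a worked example of the bound above: with LLAMA_MAX_NODES = 8192 and a 32-layer model, max_moves = 8192/(6*32) = 42 (integer division), so a single defragmentation graph can relocate at most 42 cells before another pass is needed.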
@@ -8156,15 +9116,6 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
         nh++;
     }
 
-    // each move requires 6*n_layer tensors (see build_defrag)
-    //  - source view, destination view, copy operation
-    //  - x2 for keys and values
-    //
-    if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
-        // the graph is too big, we cannot move more cells
-        break;
-    }
-
     uint32_t nf = 0;
     uint32_t is = n_kv - 1;
 
@@ -8194,11 +9145,19 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     // are we moving a continuous block of memory?
     bool cont = false;
 
+    // should we stop searching for the next move?
+    bool stop = false;
+
     // go back and move the nf cells to the hole
     for (; i1 < n_kv; ++i1) {
         auto & cell1 = kv_self.cells[i1];
 
         if (cell1.is_empty() || ids[i1] != n_kv) {
+            if (n_moves == max_moves) {
+                stop = true;
+                break;
+            }
+
             cont = false;
             continue;
         }
@@ -8225,6 +9184,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
         }
     }
 
+    if (stop || n_moves == max_moves) {
+        break;
+    }
+
     //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
 
     i0 += nh - 1;
@@ -8311,6 +9274,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 #else
     // ggml_graph defrag
 
+    ggml_backend_sched_reset(lctx.sched);
+
     ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
 
     llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
@@ -8322,14 +9287,22 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 }
 
 static void llama_kv_cache_update_internal(struct llama_context & lctx) {
+    bool need_reserve = false;
+
     // apply K-shift if needed
     if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
-        llama_set_k_shift(lctx);
-
         {
+            ggml_backend_sched_reset(lctx.sched);
+
             ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
 
+            ggml_backend_sched_alloc_graph(lctx.sched, gf);
+
+            llama_set_k_shift(lctx);
+
             llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+
+            need_reserve = true;
         }
 
         {
@@ -8343,12 +9316,56 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
         }
     }
 
+    if (lctx.kv_self.recurrent && lctx.kv_self.do_copy) {
+        {
+            ggml_backend_sched_reset(lctx.sched);
+
+            ggml_cgraph * gf = llama_build_graph_s_copy(lctx);
+
+            ggml_backend_sched_alloc_graph(lctx.sched, gf);
+
+            llama_set_s_copy(lctx);
+
+            llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+
+            need_reserve = true;
+        }
+
+        {
+            auto & kv_self = lctx.kv_self;
+
+            kv_self.do_copy = false;
+
+            for (uint32_t i = 0; i < kv_self.size; ++i) {
+                kv_self.cells[i].src = i;
+            }
+        }
+    }
+
     // defragment the KV cache if needed
     if (lctx.kv_self.do_defrag) {
         llama_kv_cache_defrag_internal(lctx);
 
+        need_reserve = true;
+
         lctx.kv_self.do_defrag = false;
     }
+
+    // reserve a worst case graph again
+    if (need_reserve) {
+        // TODO: extract to a function
+        // build worst-case graph
+        int n_tokens = (int)std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch);
+        int n_past = lctx.cparams.n_ctx - n_tokens;
+        llama_token token = llama_token_bos(&lctx.model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+        ggml_cgraph * gf = llama_build_graph(lctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
+
+        // initialize scheduler with the worst-case graph
+        ggml_backend_sched_reset(lctx.sched);
+        if (!ggml_backend_sched_reserve(lctx.sched, gf)) {
+            LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
+        }
+    }
 }
 
 //
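The two hunks above converge on a consistent ordering: reset the scheduler, build the graph, allocate it, and only then write the input tensors, because the inp_* tensors have no backing buffers until the scheduler allocates the graph (which is why llama_set_k_shift moved below the alloc call). A stand-in sketch of that sequence, with hypothetical names in place of the real ggml_backend_sched_* / llama_set_* calls:

void   scheduler_reset();
void * build_graph();
void   scheduler_alloc_graph(void * gf);
void   write_input_tensors();   // safe only after allocation
void   compute_graph(void * gf);

void update_step() {
    scheduler_reset();              // 1. drop the previous allocation
    void * gf = build_graph();      // 2. build the new graph
    scheduler_alloc_graph(gf);      // 3. input tensors get backing buffers here
    write_input_tensors();          // 4. now the inp_* data can be written
    compute_graph(gf);              // 5. run it
}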
@@ -8360,46 +9377,53 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
 }
 
 static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
 }
 
 static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
 }
 
 static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
 }
 
 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }
 
 static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
 }
 
 static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
+    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
     switch (llama_vocab_get_type(vocab)) {
-
-
-
-
-
-
-
-
-
-
-
-
-
+        case LLAMA_VOCAB_TYPE_SPM: {
+            auto buf = token_data.text.substr(3, 2);
+            return strtol(buf.c_str(), NULL, 16);
+        }
+        case LLAMA_VOCAB_TYPE_BPE: {
+            GGML_ASSERT(false);
+            return unicode_utf8_to_byte(token_data.text);
+        }
+        case LLAMA_VOCAB_TYPE_WPM: {
+            GGML_ASSERT(false);
+        }
+        default:
+            GGML_ASSERT(false);
     }
 }
 
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
+    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
     static const char * hex = "0123456789ABCDEF";
     switch (llama_vocab_get_type(vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
@@ -8414,7 +9438,7 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
         }
         case LLAMA_VOCAB_TYPE_WPM:
         case LLAMA_VOCAB_TYPE_BPE: {
-            return vocab.token_to_id.at(
+            return vocab.token_to_id.at(unicode_byte_to_utf8(ch));
         }
         default:
             GGML_ASSERT(false);
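The SPM case above relies on byte tokens being spelled "<0xNN>", so substr(3, 2) grabs the two hex digits and strtol with base 16 converts them. A self-contained sketch of that convention (assuming the "<0xNN>" spelling; the helper name is hypothetical):

#include <cassert>
#include <cstdlib>
#include <string>

static unsigned char spm_token_to_byte(const std::string & text) {
    // byte tokens look like "<0x0A>": 6 chars, hex digits at positions 3..4
    assert(text.size() == 6 && text.rfind("<0x", 0) == 0 && text.back() == '>');
    return (unsigned char) strtol(text.substr(3, 2).c_str(), nullptr, 16);
}

// spm_token_to_byte("<0x0A>") == 0x0A (newline)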
@@ -8754,9 +9778,9 @@ private:
         bpe_words.reserve(text.size());
         bpe_encoded_words.reserve(text.size());
 
-        auto
-        for (size_t i = 0; i <
-            text_utf.emplace_back(
+        const auto cpts = unicode_cpts_from_utf8(text);
+        for (size_t i = 0; i < cpts.size(); ++i)
+            text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
 
         for (int i = 0; i < (int)text_utf.size(); i++) {
             const std::string & utf_char = text_utf[i];
@@ -8806,40 +9830,40 @@ private:
             }
 
             if (!split_condition && !collecting) {
-                if (
+                if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
                     collecting_letter = true;
                     collecting = true;
                 }
-                else if (
+                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
                     collecting_numeric = true;
                     collecting = true;
                 }
                 else if (
-                    ((
-                    (!token.size() && utf_char == " " &&
+                    ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
+                    (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
                     ) {
                     collecting_special = true;
                     collecting = true;
                 }
-                else if (
+                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
                    collecting_whitespace_lookahead = true;
                    collecting = true;
                 }
-                else if (
+                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
                     split_condition = true;
                 }
             }
             else if (!split_condition && collecting) {
-                if (collecting_letter &&
+                if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
                     split_condition = true;
                 }
-                else if (collecting_numeric &&
+                else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
                     split_condition = true;
                 }
-                else if (collecting_special && (
+                else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
                     split_condition = true;
                 }
-                else if (collecting_whitespace_lookahead && (
+                else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
                     split_condition = true;
                 }
             }
@@ -8868,7 +9892,7 @@ private:
         for (std::string & word : bpe_words) {
             std::string encoded_token = "";
             for (char & c : word) {
-                encoded_token +=
+                encoded_token += unicode_byte_to_utf8(c);
             }
             bpe_encoded_words.emplace_back(encoded_token);
         }
@@ -8942,25 +9966,13 @@ struct llm_tokenizer_wpm {
     }
 
     std::vector<std::string> preprocess(const std::string & text) {
-
-        std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
-        std::vector<uint32_t> nfd_codepoints;
-        for (uint32_t code : codepoints) {
-            auto it = nfd_map.equal_range(code);
-            if (it.first != it.second) {
-                for (auto jt = it.first; jt != it.second; jt++) {
-                    nfd_codepoints.push_back(jt->second);
-                }
-            } else {
-                nfd_codepoints.push_back(code);
-            }
-        }
+        std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
 
         // strip accents, strip control, uniformize whitespace,
         // to lowercase, pad chinese characters, pad punctuation
         std::string new_str = "";
-        for (uint32_t code :
-            int type =
+        for (uint32_t code : cpts_nfd) {
+            int type = unicode_cpt_type(code);
             if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
                 continue;
             }
@@ -8968,7 +9980,7 @@ struct llm_tokenizer_wpm {
             if (type == CODEPOINT_TYPE_WHITESPACE) {
                 code = ' ';
             }
-            std::string s =
+            std::string s = unicode_cpt_to_utf8(code);
             if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
                 new_str += " ";
                 new_str += s;
@@ -8988,8 +10000,7 @@ struct llm_tokenizer_wpm {
                 if (r > l) words.push_back(new_str.substr(l, (r - l)));
                 l = r + 1;
                 r = l;
-            }
-            else {
+            } else {
                 r += 1;
             }
         }
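After the normalization pass above, every token boundary in new_str is a single space, so the final split is a plain scan. A toy, ASCII-only analogue of that whitespace-split step (illustrative; the real code performs the same scan over the padded string):

#include <string>
#include <vector>

static std::vector<std::string> split_on_spaces(const std::string & s) {
    std::vector<std::string> words;
    size_t l = 0;
    for (size_t r = 0; r <= s.size(); ++r) {
        if (r == s.size() || s[r] == ' ') { // end of string counts as a boundary
            if (r > l) words.push_back(s.substr(l, r - l));
            l = r + 1;
        }
    }
    return words;
}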
@@ -9013,17 +10024,17 @@ struct llm_tokenizer_wpm {
         return code < 256 && ispunct(code);
     }
 
-    bool is_chinese_char(uint32_t
-        if ((
-        (
-        (
-        (
-        (
-        (
-        (
-        (
-        (
-        (
+    bool is_chinese_char(uint32_t cpt) {
+        if ((cpt >= 0x4E00  && cpt <= 0x9FFF)  ||
+            (cpt >= 0x3400  && cpt <= 0x4DBF)  ||
+            (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
+            (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
+            (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
+            (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
+            (cpt >= 0xF900  && cpt <= 0xFAFF)  ||
+            (cpt >= 0x2F800 && cpt <= 0x2FA1F) ||
+            (cpt >= 0x3000  && cpt <= 0x303F)  ||
+            (cpt >= 0xFF00  && cpt <= 0xFFEF)) {
             return true; // NOLINT
         }
         return false;
@@ -9244,6 +10255,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 }
             }
         } break;
+        case LLAMA_VOCAB_TYPE_NONE:
+            GGML_ASSERT(false);
     }
 
     return output;
@@ -9600,7 +10613,7 @@ struct llama_grammar * llama_grammar_init(
 
     // loop over alternates of start rule to build initial stacks
     std::vector<std::vector<const llama_grammar_element *>> stacks;
-    pos =
+    pos = vec_rules[start_rule_index].data();
     do {
         std::vector<const llama_grammar_element *> stack;
         if (!llama_grammar_is_end_of_sequence(pos)) {
@@ -10615,13 +11628,16 @@ struct quantize_state_internal {
 
     bool has_imatrix = false;
 
+    // used to figure out if a model shares tok_embd with the output weight
+    bool has_output = false;
+
     quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
         : model(model)
         , params(params)
         {}
 };
 
-static void
+static void llama_tensor_dequantize_internal(
     struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
 ) {
@@ -10682,7 +11698,7 @@ static void llama_convert_tensor_internal(
     workers.clear();
 }
 
-static ggml_type
+static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);
 
     // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -10712,8 +11728,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
 
     // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
     // with the quantization of the output tensor
-    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
-        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
         int nx = tensor->ne[0];
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
@@ -10962,41 +11977,76 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     return new_type;
 }
 
+static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+    std::mutex mutex;
+    int counter = 0;
+    size_t new_size = 0;
+    if (nthread < 2) {
+        // single-thread
+        return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
+    }
+    auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
+            nrows, n_per_row, imatrix]() {
+        const int nrows_per_chunk = chunk_size / n_per_row;
+        size_t local_size = 0;
+        while (true) {
+            std::unique_lock<std::mutex> lock(mutex);
+            int first_row = counter; counter += nrows_per_chunk;
+            if (first_row >= nrows) {
+                if (local_size > 0) {
+                    new_size += local_size;
+                }
+                break;
+            }
+            lock.unlock();
+            const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
+            local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
+        }
+    };
+    for (int it = 0; it < nthread - 1; ++it) {
+        workers.emplace_back(compute);
+    }
+    compute();
+    for (auto & w : workers) { w.join(); }
+    workers.clear();
+    return new_size;
+}
+
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
-    ggml_type
+    ggml_type default_type;
     llama_ftype ftype = params->ftype;
 
     switch (params->ftype) {
-        case LLAMA_FTYPE_MOSTLY_Q4_0:
-        case LLAMA_FTYPE_MOSTLY_Q4_1:
-        case LLAMA_FTYPE_MOSTLY_Q5_0:
-        case LLAMA_FTYPE_MOSTLY_Q5_1:
-        case LLAMA_FTYPE_MOSTLY_Q8_0:
-        case LLAMA_FTYPE_MOSTLY_F16:
-        case LLAMA_FTYPE_ALL_F32:
+        case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
+        case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
+        case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;
 
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q2_K:
-        case LLAMA_FTYPE_MOSTLY_IQ3_XS:
+        case LLAMA_FTYPE_MOSTLY_Q2_K:    default_type = GGML_TYPE_Q2_K;    break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS:  default_type = GGML_TYPE_IQ3_S;   break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
-        case LLAMA_FTYPE_MOSTLY_Q3_K_L:
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L:  default_type = GGML_TYPE_Q3_K;    break;
         case LLAMA_FTYPE_MOSTLY_Q4_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M:
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M:  default_type = GGML_TYPE_Q4_K;    break;
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q5_K_M:
-        case LLAMA_FTYPE_MOSTLY_Q6_K:
-        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:
-        case LLAMA_FTYPE_MOSTLY_IQ2_XS:
-        case LLAMA_FTYPE_MOSTLY_IQ2_S:
-        case LLAMA_FTYPE_MOSTLY_IQ2_M:
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:
-        case LLAMA_FTYPE_MOSTLY_IQ1_S:
-        case LLAMA_FTYPE_MOSTLY_IQ4_NL:
-        case LLAMA_FTYPE_MOSTLY_IQ4_XS:
-        case LLAMA_FTYPE_MOSTLY_IQ3_S:
-        case LLAMA_FTYPE_MOSTLY_IQ3_M:
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  default_type = GGML_TYPE_Q5_K;    break;
+        case LLAMA_FTYPE_MOSTLY_Q6_K:    default_type = GGML_TYPE_Q6_K;    break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  default_type = GGML_TYPE_IQ2_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_S:   default_type = GGML_TYPE_IQ2_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_M:   default_type = GGML_TYPE_IQ2_S;   break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ1_S:   default_type = GGML_TYPE_IQ1_S;   break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL:  default_type = GGML_TYPE_IQ4_NL;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS:  default_type = GGML_TYPE_IQ4_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_S:   default_type = GGML_TYPE_IQ3_S;   break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:   default_type = GGML_TYPE_IQ3_S;   break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
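The new llama_tensor_quantize_internal above uses a shared counter behind a mutex so idle threads grab the next chunk of rows as soon as they finish, rather than a static partition. A generic, self-contained version of that work-splitting pattern (illustrative, not from llama.cpp):

#include <functional>
#include <mutex>
#include <thread>
#include <vector>

void for_each_chunk(int n_chunks, int n_threads, const std::function<void(int)> & fn) {
    std::mutex mutex;
    int counter = 0;
    auto worker = [&]() {
        while (true) {
            int chunk;
            {
                std::lock_guard<std::mutex> lock(mutex);
                if (counter >= n_chunks) break;
                chunk = counter++; // claim the next chunk under the lock
            }
            fn(chunk); // process it without holding the lock
        }
    };
    std::vector<std::thread> workers;
    for (int i = 0; i < n_threads - 1; ++i) workers.emplace_back(worker);
    worker(); // the calling thread participates too, as in the original
    for (auto & w : workers) w.join();
}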
@@ -11062,6 +12112,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         else if (name.find("ffn_up") != std::string::npos) {
             ++qs.n_ffn_up;
         }
+        else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
+            qs.has_output = true;
+        }
     }
     if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
         LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
@@ -11070,11 +12123,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     size_t total_size_org = 0;
     size_t total_size_new = 0;
-    std::vector<int64_t> hist_all(1 << 4, 0);
 
     std::vector<std::thread> workers;
     workers.reserve(nthread);
-    std::mutex mutex;
 
     int idx = 0;
 
@@ -11133,20 +12184,29 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD,    "weight");
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
 
+        // do not quantize Mamba's small yet 2D weights
+        // NOTE: can't use LLM_TN here because the layer number is not known
+        quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
+        quantize &= name.find("ssm_x.weight") == std::string::npos;
+        quantize &= name.find("ssm_dt.weight") == std::string::npos;
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
 
         if (quantize) {
-            new_type =
-
-
+            new_type = default_type;
+
+            // get more optimal quantization type based on the tensor shape, layer, etc.
+            if (!params->pure && ggml_is_quantized(default_type)) {
+                new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
             }
 
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
             quantize = tensor->type != new_type;
         }
+
         if (!quantize) {
             new_type = tensor->type;
             new_data = tensor->data;
@@ -11188,18 +12248,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                 throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
             } else {
-
+                llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
                 f32_data = (float *) f32_conv_buf.data();
             }
 
-            LLAMA_LOG_INFO("
+            LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
             fflush(stdout);
 
             if (work.size() < nelements * 4) {
                 work.resize(nelements * 4); // upper bound on size
             }
             new_data = work.data();
-            std::array<int64_t, 1 << 4> hist_cur = {};
 
             const int n_per_row = tensor->ne[0];
             const int nrows = nelements / n_per_row;
@@ -11209,56 +12268,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
             const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
-
-                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
-            } else {
-                int counter = 0;
-                new_size = 0;
-                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
-                        nrows, n_per_row, imatrix]() {
-                    std::array<int64_t, 1 << 4> local_hist = {};
-                    const int nrows_per_chunk = chunk_size / n_per_row;
-                    size_t local_size = 0;
-                    while (true) {
-                        std::unique_lock<std::mutex> lock(mutex);
-                        int first_row = counter; counter += nrows_per_chunk;
-                        if (first_row >= nrows) {
-                            if (local_size > 0) {
-                                for (int j=0; j<int(local_hist.size()); ++j) {
-                                    hist_cur[j] += local_hist[j];
-                                }
-                                new_size += local_size;
-                            }
-                            break;
-                        }
-                        lock.unlock();
-                        const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
-                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
-                                first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
-                    }
-                };
-                for (int it = 0; it < nthread_use - 1; ++it) {
-                    workers.emplace_back(compute);
-                }
-                compute();
-                for (auto & w : workers) { w.join(); }
-                workers.clear();
-            }
-
-            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
-            int64_t tot_count = 0;
-            for (size_t i = 0; i < hist_cur.size(); i++) {
-                hist_all[i] += hist_cur[i];
-                tot_count += hist_cur[i];
-            }
+            new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix, workers, nthread_use);
 
-
-                LLAMA_LOG_INFO(" | hist: ");
-                for (size_t i = 0; i < hist_cur.size(); i++) {
-                    LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
-                }
-            }
-            LLAMA_LOG_INFO("\n");
+            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
         }
         total_size_org += ggml_nbytes(tensor);
         total_size_new += new_size;
@@ -11287,24 +12299,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     LLAMA_LOG_INFO("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
     LLAMA_LOG_INFO("%s: quant size  = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
 
-    // print histogram for all tensors
-    {
-        int64_t sum_all = 0;
-        for (size_t i = 0; i < hist_all.size(); i++) {
-            sum_all += hist_all[i];
-        }
-
-        if (sum_all > 0) {
-            LLAMA_LOG_INFO("%s: hist: ", __func__);
-            for (size_t i = 0; i < hist_all.size(); i++) {
-                LLAMA_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all));
-            }
-            LLAMA_LOG_INFO("\n");
-        }
-    }
-
     if (qs.n_fallback > 0) {
-        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s)
+        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
                 __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
     }
 }
@@ -11616,10 +12612,13 @@ struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.seed                        =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx                       =*/ 512,
-        /*.n_batch                     =*/
+        /*.n_batch                     =*/ 2048,
+        /*.n_ubatch                    =*/ 512,
+        /*.n_seq_max                   =*/ 1,
         /*.n_threads                   =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
         /*.n_threads_batch             =*/ GGML_DEFAULT_N_THREADS,
         /*.rope_scaling_type           =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
+        /*.pooling_type                =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
         /*.rope_freq_base              =*/ 0.0f,
         /*.rope_freq_scale             =*/ 0.0f,
         /*.yarn_ext_factor             =*/ -1.0f,
@@ -11633,9 +12632,10 @@ struct llama_context_params llama_context_default_params() {
         /*.type_k                      =*/ GGML_TYPE_F16,
         /*.type_v                      =*/ GGML_TYPE_F16,
         /*.logits_all                  =*/ false,
-        /*.
+        /*.embeddings                  =*/ false,
         /*.offload_kqv                 =*/ true,
-        /*.
+        /*.abort_callback              =*/ nullptr,
+        /*.abort_callback_data         =*/ nullptr,
     };
 
     return result;
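A sketch of how client code picks up the new defaults above, assuming the llama.h declarations from this version and a previously loaded llama_model * named model (error handling omitted):

llama_context_params cparams = llama_context_default_params();
cparams.n_ctx    = 4096;
cparams.n_batch  = 2048; // max tokens accepted per llama_decode call
cparams.n_ubatch = 512;  // micro-batch actually run through the compute graph
llama_context * ctx = llama_new_context_with_model(model, cparams);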
@@ -11767,6 +12767,17 @@ struct llama_context * llama_new_context_with_model(
         struct llama_context_params   params) {
 
     if (!model) {
+        LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
+        return nullptr;
+    }
+
+    if (params.n_batch == 0 && params.n_ubatch == 0) {
+        LLAMA_LOG_ERROR("%s: n_batch and n_ubatch cannot both be zero\n", __func__);
+        return nullptr;
+    }
+
+    if (params.n_ctx == 0 && model->hparams.n_ctx_train == 0) {
+        LLAMA_LOG_ERROR("%s: n_ctx and model->hparams.n_ctx_train cannot both be zero\n", __func__);
         return nullptr;
     }
 
@@ -11775,7 +12786,7 @@ struct llama_context * llama_new_context_with_model(
     const auto & hparams = model->hparams;
     auto       & cparams = ctx->cparams;
 
-
+    // TODO: maybe add n_seq_max here too
     cparams.n_threads        = params.n_threads;
     cparams.n_threads_batch  = params.n_threads_batch;
     cparams.yarn_ext_factor  = params.yarn_ext_factor;
@@ -11783,13 +12794,19 @@ struct llama_context * llama_new_context_with_model(
     cparams.yarn_beta_fast   = params.yarn_beta_fast;
     cparams.yarn_beta_slow   = params.yarn_beta_slow;
     cparams.defrag_thold     = params.defrag_thold;
+    cparams.embeddings       = params.embeddings;
     cparams.offload_kqv      = params.offload_kqv;
-    cparams.
+    cparams.pooling_type     = params.pooling_type;
 
     cparams.n_ctx            = params.n_ctx           == 0    ? hparams.n_ctx_train           : params.n_ctx;
     cparams.rope_freq_base   = params.rope_freq_base  == 0.0f ? hparams.rope_freq_base_train  : params.rope_freq_base;
     cparams.rope_freq_scale  = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
 
+    // with causal attention, the batch size is limited by the context size
+    cparams.n_batch          = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
+    cparams.n_ubatch         = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
+
+
     cparams.n_yarn_orig_ctx  = params.yarn_orig_ctx    != 0 ? params.yarn_orig_ctx    :
                                hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
                                                               hparams.n_ctx_train;
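As a worked example of the clamping above: under causal attention with n_ctx = 512 and the new defaults (n_batch = 2048, n_ubatch = 512), cparams.n_batch = min(512, 2048) = 512 and cparams.n_ubatch = min(512, 512) = 512, so neither the batch nor the micro-batch can exceed the context size.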
@@ -11810,19 +12827,44 @@ struct llama_context * llama_new_context_with_model(
         cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
     }
 
+    cparams.causal_attn = hparams.causal_attn;
+
+    if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
+        if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
+            cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
+        } else {
+            cparams.pooling_type = hparams.pooling_type;
+        }
+    }
+
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
 
     LLAMA_LOG_INFO("%s: n_ctx      = %u\n",   __func__, cparams.n_ctx);
+    LLAMA_LOG_INFO("%s: n_batch    = %u\n",   __func__, cparams.n_batch);
+    LLAMA_LOG_INFO("%s: n_ubatch   = %u\n",   __func__, cparams.n_ubatch);
     LLAMA_LOG_INFO("%s: freq_base  = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n",   __func__, cparams.rope_freq_scale);
 
-    ctx->
-    ctx->
+    ctx->abort_callback      = params.abort_callback;
+    ctx->abort_callback_data = params.abort_callback_data;
+
+    ctx->rng        = std::mt19937(params.seed);
+    ctx->logits_all = params.logits_all;
 
-
-
+    uint32_t kv_size = cparams.n_ctx;
+    ggml_type type_k = params.type_k;
+    ggml_type type_v = params.type_v;
+
+    // Mamba only needs a constant number of KV cache cells per sequence
+    if (model->arch == LLM_ARCH_MAMBA) {
+        // Mamba needs at least as many KV cells as there are sequences kept at any time
+        kv_size = std::max((uint32_t) 1, params.n_seq_max);
+        // it's probably best to keep as much precision as possible for the states
+        type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states
+        type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states
+    }
 
     GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
     GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
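The Mamba branch above reflects how recurrent models use the cache: they keep one fixed-size state per sequence rather than one cell per token, so with the default n_seq_max = 1 a single F32 cell is allocated regardless of n_ctx, and the state tensors stay in full precision for the SSM operators.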
@@ -11877,13 +12919,31 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_SYCL)
         if (model->n_gpu_layers > 0) {
-            ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
-                llama_free(ctx);
-                return nullptr;
+            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+            if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+                int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
+                ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            } else {
+                // LLAMA_SPLIT_LAYER requires a backend for each GPU
+                int id_list[GGML_SYCL_MAX_DEVICES];
+                ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
+                for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
+                    int device_id = id_list[i];
+                    ggml_backend_t backend = ggml_backend_sycl_init(i);
+                    if (backend == nullptr) {
+                        LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
+                        llama_free(ctx);
+                        return nullptr;
+                    }
+                    ctx->backends.push_back(backend);
+                }
             }
-            ctx->backends.push_back(backend);
         }
 #elif defined(GGML_USE_KOMPUTE)
         if (model->n_gpu_layers > 0) {
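Which of these paths runs is decided by the split mode the caller set when loading the model. A hedged usage sketch from the client side (fields shown exist in `llama_model_params`; the path and values are placeholders):

    #include "llama.h"

    // Request layer-wise splitting so that one SYCL backend per GPU is created.
    llama_model * load_split_by_layer(const char * path) {
        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 99;                     // offload all layers
        mparams.split_mode   = LLAMA_SPLIT_MODE_LAYER; // one backend per device
        mparams.main_gpu     = 0;                      // only used by NONE/ROW modes
        return llama_load_model_from_file(path, mparams);
    }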
@@ -11904,7 +12964,7 @@ struct llama_context * llama_new_context_with_model(
         }
         ctx->backends.push_back(ctx->backend_cpu);
 
-        if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) {
+        if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) {
             LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
@@ -11928,45 +12988,31 @@ struct llama_context * llama_new_context_with_model(
                     ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
         }
 
-        // resized during inference, reserve maximum
-        ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
-
-        if (params.embedding) {
-            ctx->embedding.resize(hparams.n_embd);
-        }
-
-        // graph inputs
+        // graph outputs buffer
         {
-            ggml_init_params init_params = {
-                /* .mem_size   */ ggml_tensor_overhead()*8,
-                /* .mem_buffer */ nullptr,
-                /* .no_alloc   */ true,
-            };
-            ctx->ctx_input = ggml_init(init_params);
+            // resized during inference, reserve maximum
+            ctx->logits_size = hparams.n_vocab*cparams.n_batch;
+            ctx->embd_size   = params.embeddings ? hparams.n_embd*cparams.n_batch : 0;
 
-            ctx->inp_tokens  = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
-            ctx->inp_embd    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
-            ctx->inp_pos     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
-            ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
-            ctx->inp_KQ_pos  = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
-            ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
-            ctx->inp_mean    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
-            ctx->inp_cls     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
+            const size_t buf_output_size = (ctx->logits_size + ctx->embd_size)*sizeof(float);
+
+            ctx->buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_output_size);
+            if (ctx->buf_output == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to allocate logits buffer\n", __func__);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ggml_backend_buffer_clear(ctx->buf_output, 0);
 
-            ggml_set_name(ctx->inp_tokens, "inp_tokens");
-            ggml_set_name(ctx->inp_embd, "inp_embd");
-            ggml_set_name(ctx->inp_pos, "inp_pos");
-            ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
-            ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos");
-            ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
-            ggml_set_name(ctx->inp_mean, "inp_mean");
-            ggml_set_name(ctx->inp_cls, "inp_cls");
 
-            ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
+            ctx->logits = (float *) ggml_backend_buffer_get_base(ctx->buf_output);
+            if (params.embeddings) {
+                ctx->embd = ctx->logits + ctx->logits_size;
+            }
 
-            LLAMA_LOG_INFO("%s: %10s input buffer size   = %8.2f MiB\n", __func__,
-                    ggml_backend_buffer_name(ctx->buf_input),
-                    ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
+                    ggml_backend_buffer_name(ctx->buf_output),
+                    ggml_backend_buffer_get_size(ctx->buf_output) / 1024.0 / 1024.0);
         }
 
         // scheduler and compute buffers
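Logits and embeddings now live in one backend buffer, with `embd` aliased at a fixed offset past the logits. An illustrative sketch of that layout (the struct is ours, not part of llama.cpp):

    #include <cstddef>

    // Layout of the single output buffer allocated above:
    // [ logits: n_vocab * n_batch floats | embd: n_embd * n_batch floats ]
    struct output_layout {
        size_t logits_size; // in floats
        size_t embd_size;   // in floats, 0 when embeddings are disabled
        size_t bytes() const { return (logits_size + embd_size) * sizeof(float); }
        float * embd_ptr(float * base) const { return base + logits_size; }
    };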
@@ -11985,10 +13031,21 @@ struct llama_context * llama_new_context_with_model(
             // buffer used to store the computation graph and the tensor meta data
             ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
 
-            ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
+            // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
+            bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
+#ifndef GGML_USE_CUBLAS
+            // pipeline parallelism requires support for async compute and events
+            // currently this is only implemented in the CUDA backend
+            pipeline_parallel = false;
+#endif
+            ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel);
+
+            if (pipeline_parallel) {
+                LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
+            }
 
             // build worst-case graph
-            int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
+            int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_ubatch);
             int n_past   = cparams.n_ctx - n_tokens;
             llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
             ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
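Pipeline parallelism is only enabled when it can pay off, and only on the CUDA backend at this point. A sketch of the predicate as a standalone helper (illustrative, assuming the same inputs as above):

    // More than one device, every layer offloaded, and layer-wise splitting
    // requested; async compute + events are CUDA-only here.
    static bool want_pipeline_parallel(int n_devices, int n_gpu_layers, int n_layer,
                                       bool split_by_layer, bool cuda_backend) {
        const bool pp = n_devices > 1 && n_gpu_layers > n_layer && split_by_layer;
        return pp && cuda_backend;
    }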
@@ -12011,7 +13068,7 @@ struct llama_context * llama_new_context_with_model(
 
             // note: the number of splits during measure is higher than during inference due to the kv shift
             int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
-            LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
+            LLAMA_LOG_INFO("%s: graph splits: %d\n", __func__, n_splits);
         }
     }
 
@@ -12048,6 +13105,14 @@ uint32_t llama_n_batch(const struct llama_context * ctx) {
     return ctx->cparams.n_batch;
 }
 
+uint32_t llama_n_ubatch(const struct llama_context * ctx) {
+    return ctx->cparams.n_ubatch;
+}
+
+uint32_t llama_n_seq_max(const struct llama_context * ctx) {
+    return ctx->kv_self.size;
+}
+
 enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
     return model->vocab.type;
 }
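The two new getters let callers size their own buffers against the context's physical batch and, at this version, against the KV cell count. A hedged usage sketch (`ctx` assumed to be an already-created context):

    #include <cstdio>
    #include "llama.h"

    void print_batch_limits(const llama_context * ctx) {
        printf("logical batch   (n_batch):   %u\n", llama_n_batch(ctx));
        printf("physical batch  (n_ubatch):  %u\n", llama_n_ubatch(ctx));
        printf("max sequences   (n_seq_max): %u\n", llama_n_seq_max(ctx));
    }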
@@ -12061,6 +13126,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_MPT:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_BLOOM:
+        case LLM_ARCH_MAMBA:
             return LLAMA_ROPE_TYPE_NONE;
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -12084,6 +13150,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_QWEN2:
         case LLM_ARCH_PHI2:
         case LLM_ARCH_GEMMA:
+        case LLM_ARCH_STARCODER2:
             return LLAMA_ROPE_TYPE_NEOX;
 
         // all model arches should be listed explicitly here
@@ -12096,7 +13163,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 }
 
 int32_t llama_n_vocab(const struct llama_model * model) {
-    return model->vocab.id_to_token.size();
+    return model->hparams.n_vocab;
 }
 
 int32_t llama_n_ctx_train(const struct llama_model * model) {
@@ -12206,10 +13273,10 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
     }
 }
 
-struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
+struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
     struct llama_kv_cache_view result = {
         /*.n_cells            = */ 0,
-        /*.n_max_seq          = */ n_max_seq,
+        /*.n_seq_max          = */ n_seq_max,
         /*.token_count        = */ 0,
         /*.used_cells         = */ llama_get_kv_cache_used_cells(ctx),
         /*.max_contiguous     = */ 0,
@@ -12237,7 +13304,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
         void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
         GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
         view->cells = (struct llama_kv_cache_view_cell *)p;
-        p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
+        p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
         GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
         view->cells_sequences = (llama_seq_id *)p;
     }
@@ -12251,7 +13318,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
     uint32_t max_contig = 0;
     int32_t max_contig_idx = -1;
 
-    for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
+    for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_seq_max) {
         const size_t curr_size = kv_cells[i].seq_id.size();
         token_count += curr_size;
         c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
@@ -12268,7 +13335,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
 
         int seq_idx = 0;
         for (const llama_seq_id it : kv_cells[i].seq_id) {
-            if (seq_idx >= view->n_max_seq) {
+            if (seq_idx >= view->n_seq_max) {
                 break;
             }
             cs_curr[seq_idx] = it;
@@ -12277,7 +13344,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
         if (seq_idx != 0) {
             used_cells++;
         }
-        for (; seq_idx < view->n_max_seq; seq_idx++) {
+        for (; seq_idx < view->n_seq_max; seq_idx++) {
             cs_curr[seq_idx] = -1;
         }
     }
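Apart from the `n_max_seq` to `n_seq_max` rename, the view API is unchanged. A hedged usage sketch:

    #include <cstdio>
    #include "llama.h"

    // Inspect KV cache occupancy; ctx is an existing llama_context.
    void dump_kv_usage(const llama_context * ctx) {
        llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_seq_max=*/4);
        llama_kv_cache_view_update(ctx, &view);
        printf("cells used: %d / %d, tokens: %d\n", view.used_cells, view.n_cells, view.token_count);
        llama_kv_cache_view_free(&view);
    }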
@@ -12313,8 +13380,8 @@ void llama_kv_cache_clear(struct llama_context * ctx) {
     llama_kv_cache_clear(ctx->kv_self);
 }
 
-void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
-    llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
+bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    return llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
 }
 
 void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
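`llama_kv_cache_seq_rm` now reports failure, which matters for recurrent models whose states cannot be partially removed. A hedged sketch of checking the result (the fallback choice is ours, not prescribed by the library):

    #include "llama.h"

    // Try to drop part of a sequence; on failure, fall back to clearing
    // the whole cache (a deliberately coarse recovery for this sketch).
    void forget_range(llama_context * ctx, llama_seq_id seq, llama_pos p0, llama_pos p1) {
        if (!llama_kv_cache_seq_rm(ctx, seq, p0, p1)) {
            llama_kv_cache_clear(ctx);
        }
    }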
@@ -12365,12 +13432,17 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     const size_t s_rng             = LLAMA_MAX_RNG_STATE;
     const size_t s_logits_size     = sizeof(size_t);
     // assume worst case for logits although only currently set ones are serialized
-    const size_t s_logits          = ctx->logits.capacity() * sizeof(float);
+    const size_t s_logits          = ctx->logits_size * sizeof(float);
     const size_t s_embedding_size  = sizeof(size_t);
-    const size_t s_embedding       = ctx->embedding.size() * sizeof(float);
-    const size_t s_kv_size         = sizeof(size_t);
-    const size_t s_kv_ntok         = sizeof(int);
+    const size_t s_embedding       = ctx->embd_size * sizeof(float);
+    const size_t s_kv_buf_size     = sizeof(size_t);
+    const size_t s_kv_head         = sizeof(uint32_t);
+    const size_t s_kv_size         = sizeof(uint32_t);
+    const size_t s_kv_used         = sizeof(uint32_t);
     const size_t s_kv              = ctx->kv_self.total_size();
+    // TODO: assume the max is more than 1 seq_id per KV cell
+    const size_t s_kv_cell         = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
+    const size_t s_kv_cells        = ctx->kv_self.size * s_kv_cell;
 
     const size_t s_total = (
         + s_rng_size
@@ -12379,9 +13451,12 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
         + s_logits
         + s_embedding_size
         + s_embedding
+        + s_kv_buf_size
+        + s_kv_head
         + s_kv_size
-        + s_kv_ntok
+        + s_kv_used
         + s_kv
+        + s_kv_cells
     );
 
     return s_total;
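The session blob now serializes the KV head/size/used counters and per-cell metadata alongside the raw KV buffers, and the size above is a worst case. A hedged save/restore sketch using the public API:

    #include <vector>
    #include "llama.h"

    // Snapshot a context's state (RNG, logits, embeddings, KV cache).
    std::vector<uint8_t> save_state(llama_context * ctx) {
        std::vector<uint8_t> buf(llama_get_state_size(ctx)); // worst-case size
        const size_t written = llama_copy_state_data(ctx, buf.data());
        buf.resize(written); // actual size may be smaller
        return buf;
    }

    void restore_state(llama_context * ctx, const std::vector<uint8_t> & buf) {
        llama_set_state_data(ctx, buf.data());
    }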
@@ -12457,23 +13532,23 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
 
     // copy logits
     {
-        const size_t logits_size = ctx->logits.size();
+        const size_t logits_size = ctx->logits_size;
 
         data_ctx->write(&logits_size, sizeof(logits_size));
 
         if (logits_size) {
-            data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
+            data_ctx->write(ctx->logits, logits_size * sizeof(float));
         }
     }
 
     // copy embeddings
     {
-        const size_t embedding_size = ctx->embedding.size();
+        const size_t embeddings_size = ctx->embd_size;
 
-        data_ctx->write(&embedding_size, sizeof(embedding_size));
+        data_ctx->write(&embeddings_size, sizeof(embeddings_size));
 
-        if (embedding_size) {
-            data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
+        if (embeddings_size) {
+            data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
         }
     }
 
@@ -12481,15 +13556,13 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
     {
         const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
-        const auto & cparams = ctx->cparams;
 
         const uint32_t n_layer      = hparams.n_layer;
-        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-        const uint32_t n_ctx        = cparams.n_ctx;
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
+        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
 
         const size_t   kv_buf_size = kv_self.total_size();
-        const uint32_t kv_head     = kv_self.head;
+        const uint32_t kv_head     = llama_kv_cache_cell_max(kv_self);
         const uint32_t kv_size     = kv_self.size;
         const uint32_t kv_used     = kv_self.used;
 
@@ -12507,9 +13580,20 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
             ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
             data_ctx->write(tmp_buf.data(), tmp_buf.size());
 
+            if (kv_self.recurrent) {
+                // v is contiguous for recurrent models
+                // TODO: use other tensors for state models than k and v
+                const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
+
+                tmp_buf.resize(v_size);
+                ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), 0, tmp_buf.size());
+                data_ctx->write(tmp_buf.data(), tmp_buf.size());
+                continue;
+            }
+
             // v is not contiguous, copy row by row
             const size_t v_row_size   = ggml_row_size(kv_self.v_l[il]->type, kv_head);
-            const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
+            const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
 
             tmp_buf.resize(v_row_size);
             for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
@@ -12519,7 +13603,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
             }
         }
 
-        for (uint32_t i = 0; i < kv_size; ++i) {
+        for (uint32_t i = 0; i < kv_head; ++i) {
             const auto & cell = kv_self.cells[i];
 
             const llama_pos pos = cell.pos;
@@ -12567,27 +13651,25 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
 
         memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
 
-        GGML_ASSERT(ctx->logits.capacity() >= logits_size);
+        GGML_ASSERT(ctx->logits_size >= logits_size);
 
         if (logits_size) {
-            ctx->logits.resize(logits_size);
-
-            memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
+            memcpy(ctx->logits, inp, logits_size * sizeof(float));
             inp += logits_size * sizeof(float);
         }
     }
 
     // set embeddings
     {
-        size_t embedding_size;
+        size_t embeddings_size;
 
-        memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
+        memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
 
-        GGML_ASSERT(ctx->embedding.capacity() == embedding_size);
+        GGML_ASSERT(ctx->embd_size == embeddings_size);
 
-        if (embedding_size) {
-            memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
-            inp += embedding_size * sizeof(float);
+        if (embeddings_size) {
+            memcpy(ctx->embd, inp, embeddings_size * sizeof(float));
+            inp += embeddings_size * sizeof(float);
         }
     }
 
@@ -12595,12 +13677,10 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
     {
         const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
-        const auto & cparams = ctx->cparams;
 
         const uint32_t n_layer      = hparams.n_layer;
-        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-        const uint32_t n_ctx        = cparams.n_ctx;
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
+        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
 
         size_t   kv_buf_size;
         uint32_t kv_head;
@@ -12621,9 +13701,19 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
             ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
             inp += k_size;
 
+            if (kv_self.recurrent) {
+                // v is contiguous for recurrent models
+                // TODO: use other tensors for state models than k and v
+                const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
+
+                ggml_backend_tensor_set(kv_self.v_l[il], inp, 0, v_size);
+                inp += v_size;
+                continue;
+            }
+
             // v is not contiguous, copy row by row
             const size_t v_row_size   = ggml_row_size(kv_self.v_l[il]->type, kv_head);
-            const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
+            const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
 
             for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
                 ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
@@ -12632,13 +13722,15 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
             }
         }
 
+        GGML_ASSERT(kv_self.size == kv_size);
+
         ctx->kv_self.head = kv_head;
         ctx->kv_self.size = kv_size;
        ctx->kv_self.used = kv_used;
 
         ctx->kv_self.cells.resize(kv_size);
 
-        for (uint32_t i = 0; i < kv_size; ++i) {
+        for (uint32_t i = 0; i < kv_head; ++i) {
             llama_pos pos;
             size_t    seq_id_size;
 
@@ -12654,6 +13746,11 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
                 ctx->kv_self.cells[i].seq_id.insert(seq_id);
             }
         }
+
+        for (uint32_t i = kv_head; i < kv_size; ++i) {
+            ctx->kv_self.cells[i].pos = -1;
+            ctx->kv_self.cells[i].seq_id.clear();
+        }
     }
 
     const size_t nread = inp - src;
@@ -12751,6 +13848,15 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
     ctx->cparams.n_threads_batch = n_threads_batch;
 }
 
+void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
+    ctx->abort_callback      = abort_callback;
+    ctx->abort_callback_data = abort_callback_data;
+}
+
+void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
+    ctx->cparams.causal_attn = causal_attn;
+}
+
 struct llama_batch llama_batch_get_one(
              llama_token * tokens,
                  int32_t   n_tokens,
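The abort callback can now be swapped after context creation; returning true from it asks the backend to abort the in-flight computation. A hedged sketch (the atomic flag is our own, not part of the library):

    #include <atomic>
    #include "llama.h"

    static std::atomic<bool> g_stop{false}; // set from a signal handler or UI thread

    static bool should_abort(void * /*data*/) {
        return g_stop.load();
    }

    void install_abort(llama_context * ctx) {
        llama_set_abort_callback(ctx, should_abort, /*abort_callback_data=*/nullptr);
    }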
@@ -12817,32 +13923,81 @@ int32_t llama_decode(
     return ret;
 }
 
+void llama_synchronize(struct llama_context * ctx) {
+    ggml_backend_sched_synchronize(ctx->sched);
+
+    // FIXME: if multiple single tokens are evaluated without a synchronization,
+    // the stats will be added to the prompt evaluation stats
+    // this should only happen when using batch size 1 to evaluate a batch
+
+    // add the evaluation to the stats
+    if (ctx->n_queued_tokens == 1) {
+        ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        ctx->n_eval++;
+    } else if (ctx->n_queued_tokens > 1) {
+        ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        ctx->n_p_eval += ctx->n_queued_tokens;
+    }
+
+    // get a more accurate load time, upon first eval
+    if (ctx->n_queued_tokens > 0 && !ctx->has_evaluated_once) {
+        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+        ctx->has_evaluated_once = true;
+    }
+
+    ctx->n_queued_tokens = 0;
+    ctx->t_compute_start_us = 0;
+}
+
 float * llama_get_logits(struct llama_context * ctx) {
-    return ctx->logits.data();
+    llama_synchronize(ctx);
+
+    return ctx->logits;
 }
 
 float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
     assert(ctx->logits_valid.at(i));
-    return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
+
+    llama_synchronize(ctx);
+
+    return ctx->logits + i*ctx->model.hparams.n_vocab;
 }
 
 float * llama_get_embeddings(struct llama_context * ctx) {
-    return ctx->embedding.data();
+    llama_synchronize(ctx);
+
+    return ctx->embd;
 }
 
 float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
-    return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
+    llama_synchronize(ctx);
+
+    return ctx->embd + i*ctx->model.hparams.n_embd;
+}
+
+float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
+    llama_synchronize(ctx);
+
+    auto it = ctx->embd_seq.find(seq_id);
+    if (it == ctx->embd_seq.end()) {
+        return nullptr;
+    }
+
+    return it->second.data();
 }
 
 const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
+    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return model->vocab.id_to_token[token].text.c_str();
 }
 
 float llama_token_get_score(const struct llama_model * model, llama_token token) {
+    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return model->vocab.id_to_token[token].score;
 }
 
 llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
+    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return model->vocab.id_to_token[token].type;
 }
 
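Decoding is now asynchronous: the output getters call `llama_synchronize` internally, so a caller only blocks when it actually reads results. A hedged sketch (`batch` assumed to be a prepared `llama_batch`):

    #include "llama.h"

    // llama_decode() may return before the backend has finished; reading the
    // logits synchronizes implicitly.
    const float * decode_and_read(llama_context * ctx, llama_batch batch) {
        if (llama_decode(ctx, batch) != 0) {
            return nullptr;
        }
        // blocks until the computation is done, then returns the last token's logits
        return llama_get_logits_ith(ctx, batch.n_tokens - 1);
    }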
@@ -12887,12 +14042,12 @@ int32_t llama_tokenize(
                   const char * text,
                      int32_t   text_len,
                  llama_token * tokens,
-                     int32_t   n_max_tokens,
+                     int32_t   n_tokens_max,
                         bool   add_bos,
                         bool   special) {
     auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
 
-    if (n_max_tokens < (int) res.size()) {
+    if (n_tokens_max < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
         return -((int) res.size());
     }
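On overflow the function returns the negated required token count, which enables the usual two-call sizing pattern. A hedged sketch:

    #include <string>
    #include <vector>
    #include "llama.h"

    // Tokenize with automatic buffer sizing: a negative return value is the
    // required capacity, negated.
    std::vector<llama_token> tokenize(const llama_model * model, const std::string & text) {
        std::vector<llama_token> tokens(text.size() + 8); // rough first guess
        int32_t n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                                   tokens.data(), (int32_t) tokens.size(),
                                   /*add_bos=*/true, /*special=*/false);
        if (n < 0) {
            tokens.resize(-n);
            n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                               tokens.data(), (int32_t) tokens.size(),
                               /*add_bos=*/true, /*special=*/false);
        }
        tokens.resize(n);
        return tokens;
    }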
@@ -12906,9 +14061,9 @@ int32_t llama_tokenize(
 
 static std::string llama_decode_text(const std::string & text) {
     std::string decoded_text;
-    auto unicode_sequences = codepoints_from_utf8(text);
-    for (auto& unicode_sequence : unicode_sequences) {
-        decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence));
+    auto unicode_sequences = unicode_cpts_from_utf8(text);
+    for (auto & unicode_sequence : unicode_sequences) {
+        decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));
     }
 
     return decoded_text;
@@ -12933,7 +14088,7 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
     } else if (llama_is_user_defined_token(model->vocab, token)) {
         std::string result = model->vocab.id_to_token[token].text;
         if (length < (int) result.length()) {
-            return -result.length();
+            return -(int) result.length();
         }
         memcpy(buf, result.c_str(), result.length());
         return result.length();
@@ -12968,7 +14123,7 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
     } else if (llama_is_user_defined_token(model->vocab, token)) {
         std::string result = model->vocab.id_to_token[token].text;
         if (length < (int) result.length()) {
-            return -result.length();
+            return -(int) result.length();
         }
         memcpy(buf, result.c_str(), result.length());
         return result.length();
@@ -13005,7 +14160,7 @@ static int32_t llama_chat_apply_template_internal(
     std::string & dest, bool add_ass) {
     // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
     std::stringstream ss;
-    if (tmpl.find("<|im_start|>") != std::string::npos) {
+    if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) {
         // chatml template
         for (auto message : chat) {
             ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@@ -13013,7 +14168,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (tmpl.find("[INST]") != std::string::npos) {
+    } else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) {
         // llama2 template and its variants
         // [variant] support system message
         bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
@@ -13048,7 +14203,7 @@ static int32_t llama_chat_apply_template_internal(
             }
         }
         // llama2 templates seem to not care about "add_generation_prompt"
-    } else if (tmpl.find("<|user|>") != std::string::npos) {
+    } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
         // zephyr template
         for (auto message : chat) {
             ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@@ -13056,7 +14211,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
-    } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
+    } else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) {
         // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
         for (auto message : chat) {
             std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@@ -13065,7 +14220,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<s>assistant\n";
         }
-    } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
+    } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
         // google/gemma-7b-it
         std::string system_prompt = "";
         for (auto message : chat) {
@@ -13087,6 +14242,26 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<start_of_turn>model\n";
         }
+    } else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) {
+        // OrionStarAI/Orion-14B-Chat
+        std::string system_prompt = "";
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                // there is no system message support, we will merge it with user prompt
+                system_prompt = message->content;
+                continue;
+            } else if (role == "user") {
+                ss << "Human: ";
+                if (!system_prompt.empty()) {
+                    ss << system_prompt << "\n\n";
+                    system_prompt = "";
+                }
+                ss << message->content << "\n\nAssistant: </s>";
+            } else {
+                ss << message->content << "</s>";
+            }
+        }
     } else {
         // template not supported
         return -1;
@@ -13112,23 +14287,27 @@ LLAMA_API int32_t llama_chat_apply_template(
         int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
         if (res < 0) {
             // worst case: there is no information about template, we will use chatml by default
-            curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
+            curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
         } else {
             curr_tmpl = std::string(model_template.data(), model_template.size());
         }
     }
+
     // format the chat to string
     std::vector<const llama_chat_message *> chat_vec;
     chat_vec.resize(n_msg);
     for (size_t i = 0; i < n_msg; i++) {
         chat_vec[i] = &chat[i];
     }
+
     std::string formatted_chat;
     int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
     if (res < 0) {
         return res;
     }
-    strncpy(buf, formatted_chat.c_str(), length);
+    if (buf && length > 0) {
+        strncpy(buf, formatted_chat.c_str(), length);
+    }
     return res;
 }
 
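With the named-template shortcuts, callers can pass a known name such as "chatml" directly instead of a full template string, and passing nullptr as the template falls back to the model's embedded one. A hedged usage sketch:

    #include <string>
    #include <vector>
    #include "llama.h"

    // Render a two-message chat with the chatml template by name. The return
    // value is the formatted length, which may exceed the supplied buffer.
    std::string render_chatml(const llama_model * model) {
        llama_chat_message chat[] = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!" },
        };
        std::vector<char> buf(1024);
        int32_t n = llama_chat_apply_template(model, "chatml", chat, 2, /*add_ass=*/true,
                                              buf.data(), (int32_t) buf.size());
        if (n > (int32_t) buf.size()) { // too small: resize and retry
            buf.resize(n);
            n = llama_chat_apply_template(model, "chatml", chat, 2, true,
                                          buf.data(), (int32_t) buf.size());
        }
        return n < 0 ? std::string() : std::string(buf.data(), n);
    }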