@fugood/llama.node 1.0.0-beta.4 → 1.0.0-beta.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +7 -4
- package/lib/binding.ts +1 -1
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +27 -26
- package/src/LlamaCompletionWorker.cpp +21 -4
- package/src/LlamaCompletionWorker.h +2 -0
- package/src/LlamaContext.cpp +3 -12
- package/src/common.hpp +6 -5
- package/src/llama.cpp/CMakeLists.txt +15 -4
- package/src/llama.cpp/common/CMakeLists.txt +15 -24
- package/src/llama.cpp/common/arg.cpp +172 -110
- package/src/llama.cpp/common/chat-parser.cpp +385 -0
- package/src/llama.cpp/common/chat-parser.h +120 -0
- package/src/llama.cpp/common/chat.cpp +726 -596
- package/src/llama.cpp/common/chat.h +74 -8
- package/src/llama.cpp/common/common.cpp +56 -38
- package/src/llama.cpp/common/common.h +9 -3
- package/src/llama.cpp/common/json-partial.cpp +256 -0
- package/src/llama.cpp/common/json-partial.h +38 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
- package/src/llama.cpp/common/json-schema-to-grammar.h +4 -4
- package/src/llama.cpp/common/sampling.cpp +7 -8
- package/src/llama.cpp/common/speculative.cpp +6 -4
- package/src/llama.cpp/ggml/CMakeLists.txt +48 -3
- package/src/llama.cpp/ggml/include/ggml.h +22 -3
- package/src/llama.cpp/ggml/src/CMakeLists.txt +81 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +131 -49
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2162 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +12 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +64 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +282 -100
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1570 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +119 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +204 -49
- package/src/llama.cpp/include/llama.h +145 -40
- package/src/llama.cpp/src/CMakeLists.txt +5 -1
- package/src/llama.cpp/src/llama-arch.cpp +99 -3
- package/src/llama.cpp/src/llama-arch.h +10 -1
- package/src/llama.cpp/src/llama-batch.cpp +728 -272
- package/src/llama.cpp/src/llama-batch.h +112 -54
- package/src/llama.cpp/src/llama-chat.cpp +19 -2
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +525 -339
- package/src/llama.cpp/src/llama-context.h +38 -17
- package/src/llama.cpp/src/llama-cparams.cpp +4 -0
- package/src/llama.cpp/src/llama-cparams.h +2 -0
- package/src/llama.cpp/src/llama-grammar.cpp +12 -2
- package/src/llama.cpp/src/llama-graph.cpp +413 -353
- package/src/llama.cpp/src/llama-graph.h +112 -56
- package/src/llama.cpp/src/llama-hparams.cpp +10 -2
- package/src/llama.cpp/src/llama-hparams.h +13 -2
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +279 -0
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +128 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +1815 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.h +303 -0
- package/src/llama.cpp/src/llama-kv-cells.h +415 -0
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
- package/src/llama.cpp/src/llama-memory-hybrid.h +138 -0
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +1112 -0
- package/src/llama.cpp/src/llama-memory-recurrent.h +183 -0
- package/src/llama.cpp/src/llama-memory.cpp +41 -0
- package/src/llama.cpp/src/llama-memory.h +86 -5
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/src/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/src/llama.cpp/src/llama-model.cpp +1137 -528
- package/src/llama.cpp/src/llama-model.h +4 -0
- package/src/llama.cpp/src/llama-quant.cpp +2 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2 -2
- package/src/llama.cpp/src/llama-vocab.cpp +69 -32
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/llama.cpp +11 -7
- package/src/llama.cpp/src/unicode.cpp +5 -0
- package/src/tts_utils.h +1 -1
- package/src/llama.cpp/common/json.hpp +0 -24766
- package/src/llama.cpp/common/minja/chat-template.hpp +0 -541
- package/src/llama.cpp/common/minja/minja.hpp +0 -2974
- package/src/llama.cpp/common/stb_image.h +0 -7988
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13326
- package/src/llama.cpp/src/llama-kv-cache.cpp +0 -2827
- package/src/llama.cpp/src/llama-kv-cache.h +0 -515
- /package/src/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
package/src/llama.cpp/include/llama.h

@@ -61,7 +61,10 @@ extern "C" {
     struct llama_model;
     struct llama_context;
     struct llama_sampler;
-
+
+    typedef struct llama_memory_i * llama_memory_t;
+
+    struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
 
     typedef int32_t llama_pos;
     typedef int32_t llama_token;
@@ -240,18 +243,21 @@ extern "C" {
 
     typedef bool (*llama_progress_callback)(float progress, void * user_data);
 
-    // Input data for llama_decode
+    // Input data for llama_encode/llama_decode
     // A llama_batch object can contain input about one or many sequences
     // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
     //
     // - token  : the token ids of the input (used when embd is NULL)
     // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
     // - pos    : the positions of the respective token in the sequence
-    //            (if set to NULL, the token position will be tracked automatically by llama_decode)
+    //            (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
     // - seq_id : the sequence to which the respective token belongs
     //            (if set to NULL, the sequence ID will be assumed to be 0)
     // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
-    //            (if set to NULL
+    //            (if set to NULL:
+    //            - if embeddings: all tokens are output
+    //            - if not: only the last token is output
+    //            )
     //
     typedef struct llama_batch {
         int32_t n_tokens;
@@ -261,7 +267,7 @@ extern "C" {
         llama_pos * pos;
         int32_t * n_seq_id;
         llama_seq_id ** seq_id;
-        int8_t * logits;
+        int8_t * logits; // TODO: rename this to "output"
     } llama_batch;
 
     enum llama_model_kv_override_type {
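
For context, the logits field flagged for renaming above is the per-token output switch. A minimal sketch (not part of this diff; it assumes the usual pattern of filling a batch by hand, with an illustrative n_prompt count and tokens array already prepared) of requesting output only for the last token:

    // Sketch: mark only the final token of a manually filled llama_batch for output.
    struct llama_batch batch = llama_batch_init(n_prompt, /*embd*/ 0, /*n_seq_max*/ 1);
    for (int32_t i = 0; i < n_prompt; ++i) {
        batch.token   [i]    = tokens[i];
        batch.pos     [i]    = i;
        batch.n_seq_id[i]    = 1;
        batch.seq_id  [i][0] = 0;
        batch.logits  [i]    = 0;        // no output for prompt tokens ...
    }
    batch.logits[n_prompt - 1] = 1;      // ... except the last one
    batch.n_tokens = n_prompt;
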
@@ -366,6 +372,8 @@ extern "C" {
         bool no_perf;    // measure performance timings
         bool op_offload; // offload host tensor operations to device
         bool swa_full;   // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+                         // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
+                         // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
     };
 
     // model quantization parameters
@@ -471,6 +479,7 @@ extern "C" {
     LLAMA_API int64_t llama_time_us(void);
 
     LLAMA_API size_t llama_max_devices(void);
+    LLAMA_API size_t llama_max_parallel_sequences(void);
 
     LLAMA_API bool llama_supports_mmap (void);
     LLAMA_API bool llama_supports_mlock (void);
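
The new llama_max_parallel_sequences() reports the library's hard upper bound on parallel sequences. A small sketch (not from this diff; the requested count is illustrative) of clamping a context's n_seq_max against it:

    // Sketch: keep the requested number of parallel sequences within the library limit.
    struct llama_context_params cparams = llama_context_default_params();
    size_t n_seq_req = 16;                                 // illustrative request
    if (n_seq_req > llama_max_parallel_sequences()) {
        n_seq_req = llama_max_parallel_sequences();
    }
    cparams.n_seq_max = (uint32_t) n_seq_req;
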
@@ -490,9 +499,11 @@ extern "C" {
     DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
 
     LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
-    LLAMA_API
+    LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
     LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
 
+    DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
+
     LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
     LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
 
@@ -501,10 +512,18 @@ extern "C" {
     LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model);
 
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
 
+    // Returns the number of classifier outputs (only valid for classifier models)
+    // Undefined behavior for non-classifier models
+    LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model);
+
+    // Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided
+    LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i);
+
     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);
 
     LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
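
A short sketch of how the new classifier accessors above can be combined with the updated llama_get_embeddings_seq() documentation later in this header (not part of the diff; it assumes model and ctx were created earlier and a batch was decoded with pooling_type == LLAMA_POOLING_TYPE_RANK):

    // Sketch: print the classifier outputs for sequence 0 of a classifier/reranker model.
    const uint32_t n_cls = llama_model_n_cls_out(model);     // only valid for classifier models
    const float  * out   = llama_get_embeddings_seq(ctx, 0); // float[n_cls_out] for RANK pooling
    for (uint32_t i = 0; out != NULL && i < n_cls; ++i) {
        const char * label = llama_model_cls_label(model, i); // may be NULL if no label provided
        printf("%s: %.4f\n", label ? label : "(unlabeled)", out[i]);
    }
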
@@ -605,106 +624,190 @@ extern "C" {
             int32_t il_end);
 
     //
-    //
+    // Memory
+    //
+
+    // Clear the memory contents
+    // If data == true, the data buffers will also be cleared together with the metadata
+    LLAMA_API void llama_memory_clear(
+            llama_memory_t mem,
+            bool data);
+
+    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
+    // seq_id < 0 : match any sequence
+    // p0 < 0 : [0, p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API bool llama_memory_seq_rm(
+            llama_memory_t mem,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1);
+
+    // Copy all tokens that belong to the specified sequence to another sequence
+    // p0 < 0 : [0, p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_memory_seq_cp(
+            llama_memory_t mem,
+            llama_seq_id seq_id_src,
+            llama_seq_id seq_id_dst,
+            llama_pos p0,
+            llama_pos p1);
+
+    // Removes all tokens that do not belong to the specified sequence
+    LLAMA_API void llama_memory_seq_keep(
+            llama_memory_t mem,
+            llama_seq_id seq_id);
+
+    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // p0 < 0 : [0, p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_memory_seq_add(
+            llama_memory_t mem,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            llama_pos delta);
+
+    // Integer division of the positions by factor of `d > 1`
+    // p0 < 0 : [0, p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_memory_seq_div(
+            llama_memory_t mem,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            int d);
+
+    // Returns the smallest position present in the memory for the specified sequence
+    // This is typically non-zero only for SWA caches
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
+    // Return -1 if the sequence is empty
+    LLAMA_API llama_pos llama_memory_seq_pos_min(
+            llama_memory_t mem,
+            llama_seq_id seq_id);
+
+    // Returns the largest position present in the memory for the specified sequence
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
+    // Return -1 if the sequence is empty
+    LLAMA_API llama_pos llama_memory_seq_pos_max(
+            llama_memory_t mem,
+            llama_seq_id seq_id);
+
+    // Check if the memory supports shifting
+    LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
+
+    //
+    // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
     //
 
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
     DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
-            "Use llama_kv_self_seq_pos_max() instead");
+            "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
 
     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
     DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
-            "Use llama_kv_self_seq_pos_max() instead");
+            "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
 
     // Clear the KV cache - both cell info is erased and KV data is zeroed
-    LLAMA_API void llama_kv_self_clear(
-
+    DEPRECATED(LLAMA_API void llama_kv_self_clear(
+            struct llama_context * ctx),
+            "Use llama_memory_clear() instead");
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
     // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
     // seq_id < 0 : match any sequence
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API bool llama_kv_self_seq_rm(
+    DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
             struct llama_context * ctx,
             llama_seq_id seq_id,
             llama_pos p0,
-            llama_pos p1)
+            llama_pos p1),
+            "Use llama_memory_seq_rm() instead");
 
     // Copy all tokens that belong to the specified sequence to another sequence
     // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_self_seq_cp(
+    DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
             struct llama_context * ctx,
             llama_seq_id seq_id_src,
            llama_seq_id seq_id_dst,
            llama_pos p0,
-            llama_pos p1)
+            llama_pos p1),
+            "Use llama_memory_seq_cp() instead");
 
     // Removes all tokens that do not belong to the specified sequence
-    LLAMA_API void llama_kv_self_seq_keep(
+    DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
             struct llama_context * ctx,
-            llama_seq_id seq_id)
+            llama_seq_id seq_id),
+            "Use llama_memory_seq_keep() instead");
 
     // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     // - lazily on next llama_decode()
-    // - explicitly with llama_kv_self_update()
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_self_seq_add(
+    DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
             struct llama_context * ctx,
            llama_seq_id seq_id,
            llama_pos p0,
            llama_pos p1,
-            llama_pos delta)
+            llama_pos delta),
+            "Use llama_memory_seq_add() instead");
 
     // Integer division of the positions by factor of `d > 1`
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     // - lazily on next llama_decode()
-    // - explicitly with llama_kv_self_update()
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
-
+    DEPRECATED(void llama_kv_self_seq_div(
             struct llama_context * ctx,
            llama_seq_id seq_id,
            llama_pos p0,
            llama_pos p1,
-            int d)
+            int d),
+            "Use llama_memory_seq_div() instead");
 
     // Returns the smallest position present in the KV cache for the specified sequence
     // This is typically non-zero only for SWA caches
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
     // Return -1 if the sequence is empty
-    LLAMA_API llama_pos llama_kv_self_seq_pos_min(
+    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
             struct llama_context * ctx,
-            llama_seq_id seq_id)
+            llama_seq_id seq_id),
+            "Use llama_memory_seq_pos_min() instead");
 
     // Returns the largest position present in the KV cache for the specified sequence
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
     // Return -1 if the sequence is empty
-    LLAMA_API llama_pos llama_kv_self_seq_pos_max(
+    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
             struct llama_context * ctx,
-            llama_seq_id seq_id)
+            llama_seq_id seq_id),
+            "Use llama_memory_seq_pos_max() instead");
 
     // Defragment the KV cache
     // This will be applied:
     // - lazily on next llama_decode()
-
-
+    DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
+            "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
 
     // Check if the context supports KV cache shifting
-    LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx)
+    DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
+            "use llama_memory_can_shift() instead");
 
     // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    LLAMA_API void llama_kv_self_update(struct llama_context * ctx)
+    DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
+            "simply remove this call, updates are applied lazily on the next llama_decode()");
 
     //
     // State / sessions
     //
 
     // Returns the *actual* size in bytes of the state
-    // (logits, embedding and
+    // (logits, embedding and memory)
     // Only use when saving the state, not when restoring it, otherwise the size may be too small.
     LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
     LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
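
Taken together, the block above replaces the llama_kv_self_* family with llama_memory_* equivalents that operate on a llama_memory_t handle obtained from the context. A minimal migration sketch (not part of this diff; it assumes an existing ctx and an illustrative sequence/position range):

    // Sketch: the same cache maintenance, before and after the deprecation.
    //
    // Old (now DEPRECATED):
    //   llama_kv_self_clear(ctx);
    //   llama_kv_self_seq_rm(ctx, 0, 32, -1);
    //
    // New:
    llama_memory_t mem = llama_get_memory(ctx);   // declared earlier in this diff
    llama_memory_clear(mem, /*data =*/ true);     // erase metadata and zero the data buffers
    if (!llama_memory_seq_rm(mem, /*seq_id*/ 0, /*p0*/ 32, /*p1*/ -1)) {
        // removing part of a sequence can fail; removing a whole sequence never does
    }
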
@@ -760,12 +863,12 @@ extern "C" {
             size_t n_token_count),
         "use llama_state_save_file instead");
 
-    // Get the exact size needed to copy the
+    // Get the exact size needed to copy the state of a single sequence
     LLAMA_API size_t llama_state_seq_get_size(
             struct llama_context * ctx,
             llama_seq_id seq_id);
 
-    // Copy the
+    // Copy the state of a single sequence into the specified buffer
     LLAMA_API size_t llama_state_seq_get_data(
             struct llama_context * ctx,
             uint8_t * dst,
@@ -831,16 +934,16 @@ extern "C" {
     // For encode-decoder contexts, processes the batch using the encoder.
    // Can store the encoder output internally for later use by the decoder's cross-attention layers.
     //   0 - success
-    // < 0 - error. the
+    // < 0 - error. the memory state is restored to the state before this call
     LLAMA_API int32_t llama_encode(
             struct llama_context * ctx,
             struct llama_batch batch);
 
     // Process a batch of tokens.
-    // Requires
+    // Requires the context to have a memory.
     // For encode-decoder contexts, processes the batch using the decoder.
     // Positive return values does not mean a fatal error, but rather a warning.
-    // Upon non-zero return values, the
+    // Upon non-zero return values, the memory state is restored to the state before this call
     //   0 - success
     //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
     //   2 - aborted
@@ -861,8 +964,8 @@ extern "C" {
     // Get the number of threads used for prompt and batch processing (multiple token).
     LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
 
-    // Set whether the
-    //
+    // Set whether the context outputs embeddings or not
+    // TODO: rename to avoid confusion with llama_get_embeddings()
     LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
 
     // Set whether to use causal attention or not
@@ -911,7 +1014,7 @@ extern "C" {
 
     // Get the embeddings for a sequence id
     // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
-    // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[
+    // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence
     // otherwise: float[n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
 
@@ -941,6 +1044,7 @@ extern "C" {
 
     LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
     LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
+    LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
 
     LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
     LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
@@ -984,6 +1088,7 @@ extern "C" {
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
     /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
     /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
     ///                      as plaintext. Does not insert a leading space.
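
The extra @return line documents a new sentinel for llama_tokenize(). A hedged sketch of checking it (the llama_tokenize signature itself is not part of this diff and is assumed from the existing header; text, tokens and n_tokens_max are illustrative):

    // Sketch: distinguish "buffer too small" from the new overflow sentinel.
    int32_t n = llama_tokenize(vocab, text, (int32_t) strlen(text),
                               tokens, n_tokens_max,
                               /*add_special*/ true, /*parse_special*/ false);
    if (n == INT32_MIN) {
        // tokenization result does not fit in an int32_t at all
    } else if (n < 0) {
        // buffer too small: -n tokens would have been written
    }
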
package/src/llama.cpp/src/CMakeLists.txt

@@ -14,13 +14,17 @@ add_library(llama
             llama-batch.cpp
             llama-chat.cpp
             llama-context.cpp
+            llama-cparams.cpp
             llama-grammar.cpp
             llama-graph.cpp
             llama-hparams.cpp
             llama-impl.cpp
             llama-io.cpp
-            llama-kv-cache.cpp
+            llama-kv-cache-unified.cpp
+            llama-kv-cache-unified-iswa.cpp
             llama-memory.cpp
+            llama-memory-hybrid.cpp
+            llama-memory-recurrent.cpp
             llama-mmap.cpp
             llama-model-loader.cpp
             llama-model-saver.cpp
package/src/llama.cpp/src/llama-arch.cpp

@@ -20,6 +20,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_BERT, "bert" },
     { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
     { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
+    { LLM_ARCH_NEO_BERT, "neo-bert" },
     { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
     { LLM_ARCH_BLOOM, "bloom" },
     { LLM_ARCH_STABLELM, "stablelm" },
@@ -72,6 +73,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
     { LLM_ARCH_PLM, "plm" },
     { LLM_ARCH_BAILINGMOE, "bailingmoe" },
+    { LLM_ARCH_DOTS1, "dots1" },
+    { LLM_ARCH_ARCEE, "arcee" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -144,6 +147,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
+    { LLM_KV_ATTENTION_LAYER_INDICES, "%s.attention.layer_indices" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -174,6 +178,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" },
     { LLM_KV_CONVNEXT_BLOCK_COUNT, "%s.convnext.block_count" },
 
+    { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
+
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@@ -192,13 +198,13 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
     { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
     { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
+    { LLM_KV_TOKENIZER_ADD_SEP, "tokenizer.ggml.add_sep_token" },
     { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
     { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
     { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
     { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
     { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
     { LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
-    { LLM_KV_TOKENIZER_CHAT_TEMPLATE_N, "tokenizer.chat_template.%s" },
     { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
     { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
     { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
@@ -242,6 +248,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_ARCEE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_LLAMA4,
         {
@@ -448,6 +472,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_TOKEN_TYPES, "token_types" },
             { LLM_TENSOR_POS_EMBD, "position_embd" },
             { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
             { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
             { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
             { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
@@ -492,6 +517,21 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_NEO_BERT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
+            { LLM_TENSOR_CLS, "cls" },
+            { LLM_TENSOR_CLS_OUT, "cls.output" },
+        },
+    },
     {
         LLM_ARCH_JINA_BERT_V2,
         {
@@ -1553,6 +1593,34 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
         },
     },
+    {
+        LLM_ARCH_DOTS1,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+        }
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1704,8 +1772,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
 
 std::string LLM_KV::operator()(llm_kv kv) const {
-
-
+    std::string name = ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
+
+    if (suffix != nullptr) {
+        name += ".";
+        name += suffix;
+    }
+
+    return name;
 }
 
 std::string LLM_TN_IMPL::str() const {
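
Worked example (not part of the diff): the rebuilt operator() body formats the per-architecture GGUF key and appends the optional suffix, so with the mappings listed earlier, LLM_KV(LLM_ARCH_BERT)(LLM_KV_ATTENTION_SCALE) yields "bert.attention.scale", and a non-null suffix such as "lora" (purely illustrative) would yield "bert.attention.scale.lora".
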
@@ -1744,3 +1818,25 @@ llm_arch llm_arch_from_string(const std::string & name) {
 const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor) {
     return LLM_TENSOR_INFOS.at(tensor);
 }
+
+bool llm_arch_is_recurrent(const llm_arch & arch) {
+    switch (arch) {
+        case LLM_ARCH_MAMBA:
+        case LLM_ARCH_RWKV6:
+        case LLM_ARCH_RWKV6QWEN2:
+        case LLM_ARCH_RWKV7:
+        case LLM_ARCH_ARWKV7:
+            return true;
+        default:
+            return false;
+    }
+}
+
+bool llm_arch_is_hybrid(const llm_arch & arch) {
+    // TODO: There are currently no hybrid models! Once there are, this will be
+    //       the place to identify them
+    switch (arch) {
+        default:
+            return false;
+    }
+}
package/src/llama.cpp/src/llama-arch.h

@@ -24,6 +24,7 @@ enum llm_arch {
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
     LLM_ARCH_NOMIC_BERT_MOE,
+    LLM_ARCH_NEO_BERT,
     LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
@@ -76,6 +77,8 @@ enum llm_arch {
     LLM_ARCH_WAVTOKENIZER_DEC,
     LLM_ARCH_PLM,
     LLM_ARCH_BAILINGMOE,
+    LLM_ARCH_DOTS1,
+    LLM_ARCH_ARCEE,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -148,6 +151,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
+    LLM_KV_ATTENTION_LAYER_INDICES,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -190,13 +194,13 @@ enum llm_kv {
     LLM_KV_TOKENIZER_MASK_ID,
     LLM_KV_TOKENIZER_ADD_BOS,
     LLM_KV_TOKENIZER_ADD_EOS,
+    LLM_KV_TOKENIZER_ADD_SEP,
     LLM_KV_TOKENIZER_ADD_PREFIX,
     LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
     LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
     LLM_KV_TOKENIZER_CHAT_TEMPLATE,
-    LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,
     LLM_KV_TOKENIZER_FIM_PRE_ID,
     LLM_KV_TOKENIZER_FIM_SUF_ID,
     LLM_KV_TOKENIZER_FIM_MID_ID,
@@ -213,6 +217,8 @@ enum llm_kv {
     LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
     LLM_KV_CONVNEXT_BLOCK_COUNT,
 
+    LLM_KV_CLASSIFIER_OUTPUT_LABELS,
+
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
     LLM_KV_TOKENIZER_SUFFIX_ID,
@@ -435,3 +441,6 @@ const char * llm_arch_name(llm_arch arch);
 llm_arch llm_arch_from_string(const std::string & name);
 
 const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);
+
+bool llm_arch_is_recurrent(const llm_arch & arch);
+bool llm_arch_is_hybrid (const llm_arch & arch);