@fugood/llama.node 1.4.11 → 1.4.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +31 -31
- package/src/llama.cpp/common/arg.cpp +128 -59
- package/src/llama.cpp/common/arg.h +1 -0
- package/src/llama.cpp/common/chat-parser.cpp +11 -0
- package/src/llama.cpp/common/chat.cpp +36 -7
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +42 -23
- package/src/llama.cpp/common/common.h +11 -1
- package/src/llama.cpp/common/llguidance.cpp +10 -6
- package/src/llama.cpp/common/regex-partial.cpp +13 -13
- package/src/llama.cpp/common/sampling.cpp +58 -14
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- package/src/llama.cpp/include/llama.h +100 -12
- package/src/llama.cpp/src/CMakeLists.txt +4 -0
- package/src/llama.cpp/src/llama-adapter.cpp +12 -3
- package/src/llama.cpp/src/llama-adapter.h +7 -1
- package/src/llama.cpp/src/llama-arch.cpp +78 -0
- package/src/llama.cpp/src/llama-arch.h +8 -0
- package/src/llama.cpp/src/llama-chat.cpp +11 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +637 -49
- package/src/llama.cpp/src/llama-context.h +43 -1
- package/src/llama.cpp/src/llama-grammar.cpp +40 -13
- package/src/llama.cpp/src/llama-grammar.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +173 -5
- package/src/llama.cpp/src/llama-graph.h +71 -6
- package/src/llama.cpp/src/llama-hparams.cpp +4 -0
- package/src/llama.cpp/src/llama-hparams.h +12 -5
- package/src/llama.cpp/src/llama-kv-cache.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
- package/src/llama.cpp/src/llama-model.cpp +337 -26
- package/src/llama.cpp/src/llama-model.h +13 -2
- package/src/llama.cpp/src/llama-sampling.cpp +1259 -186
- package/src/llama.cpp/src/llama-sampling.h +19 -7
- package/src/llama.cpp/src/llama-vocab.cpp +101 -33
- package/src/llama.cpp/src/llama-vocab.h +2 -0
- package/src/llama.cpp/src/llama.cpp +87 -64
- package/src/llama.cpp/src/models/afmoe.cpp +9 -5
- package/src/llama.cpp/src/models/bert.cpp +4 -2
- package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
- package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/gemma3.cpp +3 -4
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
- package/src/llama.cpp/src/models/llama.cpp +19 -6
- package/src/llama.cpp/src/models/maincoder.cpp +117 -0
- package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- package/src/llama.cpp/src/models/models.h +18 -0
- package/src/llama.cpp/src/models/modern-bert.cpp +116 -0
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/plamo3.cpp +128 -0
- package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
- package/src/llama.cpp/src/unicode.cpp +23 -14
package/src/llama.cpp/include/llama.h

@@ -286,7 +286,7 @@ extern "C" {
         // NULL-terminated list of buffer types to use for tensors that match a pattern
         const struct llama_model_tensor_buft_override * tensor_buft_overrides;
 
-        int32_t n_gpu_layers; // number of layers to store in VRAM
+        int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
         // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
@@ -316,6 +316,11 @@ extern "C" {
         bool no_alloc; // only load metadata and simulate memory allocations
     };
 
+    struct llama_sampler_seq_config {
+        llama_seq_id seq_id;
+        struct llama_sampler * sampler;
+    };
+
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
     // https://github.com/ggml-org/llama.cpp/pull/7544
     struct llama_context_params {
@@ -364,6 +369,12 @@ extern "C" {
         bool kv_unified; // use a unified buffer across the input sequences when computing the attention
                          // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
                          // ref: https://github.com/ggml-org/llama.cpp/pull/14363
+
+        // [EXPERIMENTAL]
+        // backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
+        // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
+        struct llama_sampler_seq_config * samplers;
+        size_t n_samplers;
     };
 
     // model quantization parameters
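Taken together, the two hunks above add per-sequence backend sampling: a llama_sampler_seq_config pairs a sequence id with a sampler chain, and llama_context_params gains a samplers/n_samplers pair. A minimal sketch of wiring this up at context creation (the helper name and sampler choices are illustrative, not from the package; error handling omitted):

// sketch: create a context with a backend sampler chain attached to sequence 0
static struct llama_context * create_ctx_with_sampler(struct llama_model * model) {
    // the samplers must be chains, per the note in the hunk above
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(1234)); // seeded final pick

    struct llama_sampler_seq_config cfg = {
        /* .seq_id  = */ 0,
        /* .sampler = */ chain,
    };

    struct llama_context_params cparams = llama_context_default_params();
    cparams.samplers   = &cfg;
    cparams.n_samplers = 1;

    // the chain (and, to be safe, the config array) should outlive the context
    return llama_init_from_model(model, cparams);
}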
@@ -467,10 +478,17 @@ extern "C" {
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
+    enum llama_params_fit_status {
+        LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
+        LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
+        LLAMA_PARAMS_FIT_STATUS_ERROR   = 2, // a hard error occurred, e.g. because no model could be found at the specified path
+    };
+
     // fits mparams and cparams to free device memory (assumes system memory is unlimited)
-    // returns true if the parameters could be successfully modified to fit device memory
-    // this function is NOT thread safe because it modifies the global llama logger state
-    LLAMA_API bool llama_params_fit(
+    // - returns true if the parameters could be successfully modified to fit device memory
+    // - this function is NOT thread safe because it modifies the global llama logger state
+    // - only parameters that have the same value as in llama_default_model_params are modified
+    LLAMA_API enum llama_params_fit_status llama_params_fit(
             const char * path_model,
             struct llama_model_params * mparams,
             struct llama_context_params * cparams,
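A sketch of consuming the new status enum; the trailing arguments of llama_params_fit are not shown in this hunk, so the call is abbreviated and the model path is hypothetical:

struct llama_model_params   mparams = llama_model_default_params();
struct llama_context_params cparams = llama_context_default_params();

// remaining arguments elided because the hunk cuts off after cparams
switch (llama_params_fit("model.gguf", &mparams, &cparams /* , ... */)) {
    case LLAMA_PARAMS_FIT_STATUS_SUCCESS:
        // mparams/cparams now describe allocations projected to fit free device memory
        break;
    case LLAMA_PARAMS_FIT_STATUS_FAILURE:
        // no fitting allocation found; the caller decides whether to proceed anyway
        break;
    case LLAMA_PARAMS_FIT_STATUS_ERROR:
        // hard error, e.g. the model file could not be found
        break;
}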
@@ -517,6 +535,7 @@ extern "C" {
     LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head    (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
@@ -600,6 +619,8 @@ extern "C" {
     //
 
     // Load a LoRA adapter from file
+    // The adapter is valid as long as the associated model is not freed
+    // All adapters must be loaded before context creation
     LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
             struct llama_model * model,
             const char * path_lora);
@@ -983,6 +1004,32 @@ extern "C" {
     // otherwise: float[n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
 
+    //
+    // backend sampling API [EXPERIMENTAL]
+    // note: use only if the llama_context was created with at least one llama_sampler_seq_config
+    //
+
+    // Get the backend sampled token for the ith token.
+    // Returns LLAMA_TOKEN_NULL if no token was sampled.
+    LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
+
+    // Get the backend sampled probabilities for the ith token
+    // The index matches llama_get_sampled_token_ith().
+    // Returns NULL if no probabilities were generated.
+    LLAMA_API float *  llama_get_sampled_probs_ith      (struct llama_context * ctx, int32_t i);
+    LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
+
+    // Get the backend sampled logits for the ith token
+    // Returns NULL if no logits were sampled.
+    LLAMA_API float *  llama_get_sampled_logits_ith      (struct llama_context * ctx, int32_t i);
+    LLAMA_API uint32_t llama_get_sampled_logits_count_ith(struct llama_context * ctx, int32_t i);
+
+    // Get the backend sampled candidates (token ids) for the ith token
+    // These are needed to map probability/logit indices to vocab token ids.
+    // Returns NULL if no candidates were sampled.
+    LLAMA_API llama_token * llama_get_sampled_candidates_ith      (struct llama_context * ctx, int32_t i);
+    LLAMA_API uint32_t      llama_get_sampled_candidates_count_ith(struct llama_context * ctx, int32_t i);
+
     //
     // Vocab
     //
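A minimal decode-loop sketch for the new getters, assuming the context was created with a sampler config for the sequence being decoded (the index semantics are assumed to follow the other *_ith getters):

// after a successful llama_decode(), read the backend-sampled result for output 0
llama_token tok = llama_get_sampled_token_ith(ctx, 0);
if (tok == LLAMA_TOKEN_NULL) {
    // nothing was sampled on the backend; fall back to CPU-side sampling
} else {
    // optional: inspect the distribution that produced the token
    const float       * probs = llama_get_sampled_probs_ith(ctx, 0);
    const llama_token * cand  = llama_get_sampled_candidates_ith(ctx, 0);
    const uint32_t      n     = llama_get_sampled_probs_count_ith(ctx, 0);
    for (uint32_t j = 0; probs && cand && j < n; j++) {
        // cand[j] is the vocab id whose probability is probs[j]
    }
}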
@@ -1154,11 +1201,16 @@ extern "C" {
     //
     // llama_sampler_free(smpl);
     //
-    // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
-    //
 
     typedef void * llama_sampler_context_t;
 
+    struct llama_sampler_data {
+        struct ggml_tensor * logits;
+        struct ggml_tensor * probs;
+        struct ggml_tensor * sampled;
+        struct ggml_tensor * candidates;
+    };
+
     // user code can implement the interface below in order to create custom llama_sampler
     struct llama_sampler_i {
         const char * (*name) (const struct llama_sampler * smpl); // can be NULL
@@ -1168,17 +1220,45 @@ extern "C" {
         struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL
         void                   (*free)  (      struct llama_sampler * smpl); // can be NULL if ctx is NULL
 
-        //
-        //
+        // [EXPERIMENTAL]
+        // backend sampling interface:
+
+        // return true if the backend supports all ops needed by the sampler
+        // note: call once per sampler
+        bool (*backend_init)(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft);
+
+        // call after .backend_apply()
+        void (*backend_accept)(
+                struct llama_sampler * smpl,
+                struct ggml_context * ctx,
+                struct ggml_cgraph * gf,
+                struct ggml_tensor * selected_token);
+
+        // call after .backend_init()
+        void (*backend_apply)(
+                struct llama_sampler * smpl,
+                struct ggml_context * ctx,
+                struct ggml_cgraph * gf,
+                struct llama_sampler_data * data);
+
+        // called before graph execution to set inputs for the current ubatch
+        void (*backend_set_input)(struct llama_sampler * smpl);
     };
 
     struct llama_sampler {
-        const struct llama_sampler_i * iface;
-        llama_sampler_context_t        ctx;
+        struct llama_sampler_i * iface;
+
+        llama_sampler_context_t ctx;
     };
 
+    // [EXPERIMENTAL]
+    // attach a sampler to the context
+    // note: prefer initializing the context with llama_context_params.samplers when possible
+    // note: changing the samplers of a context can cause graph reallocations and degraded performance
+    LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
+
     // mirror of llama_sampler_i:
-    LLAMA_API struct llama_sampler * llama_sampler_init  (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
+    LLAMA_API struct llama_sampler * llama_sampler_init  (      struct llama_sampler_i * iface, llama_sampler_context_t ctx);
     LLAMA_API const char *           llama_sampler_name  (const struct llama_sampler * smpl);
     LLAMA_API void                   llama_sampler_accept(      struct llama_sampler * smpl, llama_token token);
     LLAMA_API void                   llama_sampler_apply (      struct llama_sampler * smpl, llama_token_data_array * cur_p);
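For custom samplers the extended llama_sampler_i stays source-compatible: the new backend_* callbacks can be left NULL for a CPU-only sampler. A hedged sketch (the my_* names are hypothetical; the accept/apply/reset field positions follow the upstream header, which this diff shows only in part):

static const char * my_name(const struct llama_sampler * smpl) { return "my-greedy"; }

static void my_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    (void) smpl;
    size_t best = 0; // pick the highest-logit candidate
    for (size_t i = 1; i < cur_p->size; i++) {
        if (cur_p->data[i].logit > cur_p->data[best].logit) {
            best = i;
        }
    }
    cur_p->selected = (int64_t) best;
}

static struct llama_sampler_i my_iface = {
    /* .name              = */ my_name,
    /* .accept            = */ NULL,
    /* .apply             = */ my_apply,
    /* .reset             = */ NULL,
    /* .clone             = */ NULL,
    /* .free              = */ NULL,
    /* .backend_init      = */ NULL, // NULL => no backend offload advertised
    /* .backend_accept    = */ NULL,
    /* .backend_apply     = */ NULL,
    /* .backend_set_input = */ NULL,
};

struct llama_sampler * my_sampler_init(void) {
    return llama_sampler_init(&my_iface, /* ctx = */ NULL);
}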
@@ -1194,7 +1274,15 @@ extern "C" {
 
     // important: takes ownership of the sampler object and will free it when llama_sampler_free is called
     LLAMA_API void llama_sampler_chain_add(      struct llama_sampler * chain, struct llama_sampler * smpl);
-
+
+    // return NULL if:
+    // - the sampler is NULL
+    // - the sampler is not a llama_sampler_chain
+    // - the index is out of bounds, unless i == -1
+    // - if i == -1, returns the chain itself (can be used to check if the sampler is a chain)
+    LLAMA_API struct llama_sampler * llama_sampler_chain_get(      struct llama_sampler * chain, int32_t i);
+
+    // the total number of samplers in the chain
     LLAMA_API int                    llama_sampler_chain_n  (const struct llama_sampler * chain);
 
     // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
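Per the comments above, llama_sampler_chain_get with i == -1 doubles as a chain check; a small sketch:

// returns non-NULL only if smpl is a chain
if (llama_sampler_chain_get(smpl, -1) != NULL) {
    for (int32_t i = 0; i < llama_sampler_chain_n(smpl); i++) {
        printf("sampler %d: %s\n", i, llama_sampler_name(llama_sampler_chain_get(smpl, i)));
    }
}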
package/src/llama.cpp/src/CMakeLists.txt

@@ -87,9 +87,12 @@ add_library(llama
             models/llada.cpp
             models/llama-iswa.cpp
             models/llama.cpp
+            models/maincoder.cpp
             models/mamba.cpp
+            models/mimo2-iswa.cpp
             models/minicpm3.cpp
             models/minimax-m2.cpp
+            models/modern-bert.cpp
             models/mpt.cpp
             models/nemotron-h.cpp
             models/nemotron.cpp
@@ -105,6 +108,7 @@ add_library(llama
             models/phi3.cpp
             models/plamo.cpp
             models/plamo2.cpp
+            models/plamo3.cpp
             models/plm.cpp
             models/qwen.cpp
             models/qwen2.cpp
package/src/llama.cpp/src/llama-adapter.cpp

@@ -146,9 +146,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
     return nullptr;
 }
 
-static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
+static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
+    llama_model & model = adapter.model;
+
     ggml_context * ctx_init;
     gguf_init_params meta_gguf_params = {
         /* .no_alloc = */ true,
@@ -411,14 +413,17 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
         }
     }
 
+    // update number of nodes used
+    model.n_lora_nodes += adapter.get_n_nodes();
+
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
 
 llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
-    llama_adapter_lora * adapter = new llama_adapter_lora();
+    llama_adapter_lora * adapter = new llama_adapter_lora(*model);
 
     try {
-        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
+        llama_adapter_lora_init_impl(path_lora, *adapter);
         return adapter;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -469,6 +474,10 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
 }
 
 void llama_adapter_lora_free(llama_adapter_lora * adapter) {
+    // update number of nodes used
+    GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
+    adapter->model.n_lora_nodes -= adapter->get_n_nodes();
+
     delete adapter;
 }
 
package/src/llama.cpp/src/llama-adapter.h

@@ -59,6 +59,8 @@ struct llama_adapter_lora_weight {
 };
 
 struct llama_adapter_lora {
+    llama_model & model;
+
     // map tensor name to lora_a_b
     std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
 
@@ -73,10 +75,14 @@ struct llama_adapter_lora {
     // activated lora (aLoRA)
     std::vector<llama_token> alora_invocation_tokens;
 
-    llama_adapter_lora() = default;
+    llama_adapter_lora(llama_model & model) : model(model) {}
     ~llama_adapter_lora() = default;
 
     llama_adapter_lora_weight * get_weight(ggml_tensor * w);
+
+    uint32_t get_n_nodes() const {
+        return ab_map.size() * 6u; // a, b, scale, add, 2 x mul_mat
+    }
 };
 
 using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
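Worked example of the node accounting above: an adapter whose ab_map holds 128 A/B tensor pairs reports get_n_nodes() == 128 * 6 = 768. llama_adapter_lora_init adds that amount to model.n_lora_nodes and llama_adapter_lora_free asserts and subtracts it again, keeping the model's reserved graph size in sync with the adapters currently loaded.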
package/src/llama.cpp/src/llama-arch.cpp

@@ -20,6 +20,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_STARCODER,        "starcoder"      },
     { LLM_ARCH_REFACT,           "refact"         },
     { LLM_ARCH_BERT,             "bert"           },
+    { LLM_ARCH_MODERN_BERT,      "modern-bert"    },
     { LLM_ARCH_NOMIC_BERT,       "nomic-bert"     },
     { LLM_ARCH_NOMIC_BERT_MOE,   "nomic-bert-moe" },
     { LLM_ARCH_NEO_BERT,         "neo-bert"       },
@@ -41,6 +42,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_PHIMOE,           "phimoe"         },
     { LLM_ARCH_PLAMO,            "plamo"          },
     { LLM_ARCH_PLAMO2,           "plamo2"         },
+    { LLM_ARCH_PLAMO3,           "plamo3"         },
     { LLM_ARCH_CODESHELL,        "codeshell"      },
     { LLM_ARCH_ORION,            "orion"          },
     { LLM_ARCH_INTERNLM2,        "internlm2"      },
@@ -114,6 +116,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_RND1,             "rnd1"           },
     { LLM_ARCH_PANGU_EMBED,      "pangu-embedded" },
     { LLM_ARCH_MISTRAL3,         "mistral3"       },
+    { LLM_ARCH_MIMO2,            "mimo2"          },
+    { LLM_ARCH_LLAMA_EMBED,      "llama-embed"    },
+    { LLM_ARCH_MAINCODER,        "maincoder"      },
     { LLM_ARCH_UNKNOWN,          "(unknown)"      },
 };
 
@@ -147,6 +152,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_VOCAB_SIZE,                    "%s.vocab_size"                },
     { LLM_KV_CONTEXT_LENGTH,                "%s.context_length"            },
     { LLM_KV_EMBEDDING_LENGTH,              "%s.embedding_length"          },
+    { LLM_KV_EMBEDDING_LENGTH_OUT,          "%s.embedding_length_out"      },
     { LLM_KV_FEATURES_LENGTH,               "%s.features_length"           },
     { LLM_KV_BLOCK_COUNT,                   "%s.block_count"               },
     { LLM_KV_LEADING_DENSE_BLOCK_COUNT,     "%s.leading_dense_block_count" },
@@ -204,6 +210,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_GATE_LORA_RANK,         "%s.attention.gate_lora_rank"         },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
+    { LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, "%s.attention.sliding_window_pattern" },
     { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
     { LLM_KV_ATTENTION_OUTPUT_SCALE,           "%s.attention.output_scale"           },
     { LLM_KV_ATTENTION_TEMPERATURE_LENGTH,     "%s.attention.temperature_length"     },
@@ -214,6 +221,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_DIMENSION_COUNT,      "%s.rope.dimension_count"    },
     { LLM_KV_ROPE_DIMENSION_SECTIONS,   "%s.rope.dimension_sections" },
     { LLM_KV_ROPE_FREQ_BASE,            "%s.rope.freq_base"          },
+    { LLM_KV_ROPE_FREQ_BASE_SWA,        "%s.rope.freq_base_swa"      },
     { LLM_KV_ROPE_SCALE_LINEAR,         "%s.rope.scale_linear"       },
     { LLM_KV_ROPE_SCALING_TYPE,         "%s.rope.scaling.type"       },
     { LLM_KV_ROPE_SCALING_FACTOR,       "%s.rope.scaling.factor"     },
@@ -497,6 +505,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_DECI:
         case LLM_ARCH_MISTRAL3:
+        case LLM_ARCH_LLAMA_EMBED:
             return {
                 LLM_TENSOR_TOKEN_EMBD,
                 LLM_TENSOR_OUTPUT_NORM,
@@ -778,6 +787,20 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                 LLM_TENSOR_CLS,
                 LLM_TENSOR_CLS_OUT,
             };
+        case LLM_ARCH_MODERN_BERT:
+            return {
+                LLM_TENSOR_TOKEN_EMBD,
+                LLM_TENSOR_TOKEN_EMBD_NORM,
+                LLM_TENSOR_OUTPUT_NORM,
+                LLM_TENSOR_ATTN_NORM,
+                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_ATTN_QKV,
+                LLM_TENSOR_FFN_DOWN,
+                LLM_TENSOR_FFN_UP,
+                LLM_TENSOR_FFN_NORM,
+                LLM_TENSOR_CLS,
+                LLM_TENSOR_CLS_OUT,
+            };
         case LLM_ARCH_JINA_BERT_V2:
             return {
                 LLM_TENSOR_TOKEN_EMBD,
@@ -1057,6 +1080,22 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                 LLM_TENSOR_ATTN_POST_NORM,
                 LLM_TENSOR_FFN_POST_NORM,
             };
+        case LLM_ARCH_PLAMO3:
+            return {
+                LLM_TENSOR_TOKEN_EMBD,
+                LLM_TENSOR_OUTPUT_NORM,
+                LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_ATTN_NORM,
+                LLM_TENSOR_ATTN_QKV,
+                LLM_TENSOR_ATTN_Q_NORM,
+                LLM_TENSOR_ATTN_K_NORM,
+                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_ATTN_POST_NORM,
+                LLM_TENSOR_FFN_NORM,
+                LLM_TENSOR_FFN_POST_NORM,
+                LLM_TENSOR_FFN_DOWN,
+                LLM_TENSOR_FFN_UP,
+            };
         case LLM_ARCH_CODESHELL:
             return {
                 LLM_TENSOR_TOKEN_EMBD,
@@ -2037,6 +2076,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                 LLM_TENSOR_TOKEN_EMBD,
                 LLM_TENSOR_OUTPUT_NORM_LFM2,
                 LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_DENSE_2_OUT,
             };
         case LLM_ARCH_LFM2MOE:
             return {
@@ -2171,11 +2211,49 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                 LLM_TENSOR_VISEXP_FFN_DOWN,
                 LLM_TENSOR_VISEXP_FFN_UP,
             };
+        case LLM_ARCH_MIMO2:
+            return {
+                LLM_TENSOR_TOKEN_EMBD,
+                LLM_TENSOR_OUTPUT_NORM,
+                LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_ATTN_NORM,
+                LLM_TENSOR_ATTN_Q,
+                LLM_TENSOR_ATTN_K,
+                LLM_TENSOR_ATTN_V,
+                LLM_TENSOR_ATTN_SINKS,
+                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_FFN_NORM,
+                LLM_TENSOR_FFN_GATE,
+                LLM_TENSOR_FFN_DOWN,
+                LLM_TENSOR_FFN_UP,
+                LLM_TENSOR_FFN_GATE_INP,
+                LLM_TENSOR_FFN_GATE_EXPS,
+                LLM_TENSOR_FFN_DOWN_EXPS,
+                LLM_TENSOR_FFN_UP_EXPS,
+                LLM_TENSOR_FFN_EXP_PROBS_B,
+            };
         case LLM_ARCH_GPTJ:
         case LLM_ARCH_UNKNOWN:
             return {
                 LLM_TENSOR_TOKEN_EMBD,
             };
+        case LLM_ARCH_MAINCODER:
+            return {
+                LLM_TENSOR_TOKEN_EMBD,
+                LLM_TENSOR_OUTPUT_NORM,
+                LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_ATTN_NORM,
+                LLM_TENSOR_ATTN_Q,
+                LLM_TENSOR_ATTN_Q_NORM,
+                LLM_TENSOR_ATTN_K,
+                LLM_TENSOR_ATTN_K_NORM,
+                LLM_TENSOR_ATTN_V,
+                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_FFN_NORM,
+                LLM_TENSOR_FFN_GATE,
+                LLM_TENSOR_FFN_DOWN,
+                LLM_TENSOR_FFN_UP,
+            };
         default:
             GGML_ABORT("unknown architecture for tensor mapping");
     }
package/src/llama.cpp/src/llama-arch.h

@@ -24,6 +24,7 @@ enum llm_arch {
     LLM_ARCH_STARCODER,
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
+    LLM_ARCH_MODERN_BERT,
     LLM_ARCH_NOMIC_BERT,
     LLM_ARCH_NOMIC_BERT_MOE,
     LLM_ARCH_NEO_BERT,
@@ -45,6 +46,7 @@ enum llm_arch {
     LLM_ARCH_PHIMOE,
     LLM_ARCH_PLAMO,
     LLM_ARCH_PLAMO2,
+    LLM_ARCH_PLAMO3,
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,
     LLM_ARCH_INTERNLM2,
@@ -118,6 +120,9 @@ enum llm_arch {
     LLM_ARCH_RND1,
     LLM_ARCH_PANGU_EMBED,
     LLM_ARCH_MISTRAL3,
+    LLM_ARCH_MIMO2,
+    LLM_ARCH_LLAMA_EMBED,
+    LLM_ARCH_MAINCODER,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -151,6 +156,7 @@ enum llm_kv {
     LLM_KV_VOCAB_SIZE,
     LLM_KV_CONTEXT_LENGTH,
     LLM_KV_EMBEDDING_LENGTH,
+    LLM_KV_EMBEDDING_LENGTH_OUT,
     LLM_KV_FEATURES_LENGTH,
     LLM_KV_BLOCK_COUNT,
     LLM_KV_LEADING_DENSE_BLOCK_COUNT,
@@ -208,6 +214,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_GATE_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_OUTPUT_SCALE,
     LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
@@ -218,6 +225,7 @@ enum llm_kv {
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
     LLM_KV_ROPE_FREQ_BASE,
+    LLM_KV_ROPE_FREQ_BASE_SWA,
     LLM_KV_ROPE_SCALE_LINEAR,
     LLM_KV_ROPE_SCALING_TYPE,
     LLM_KV_ROPE_SCALING_FACTOR,
package/src/llama.cpp/src/llama-chat.cpp

@@ -74,6 +74,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "seed_oss",          LLM_CHAT_TEMPLATE_SEED_OSS       },
    { "grok-2",            LLM_CHAT_TEMPLATE_GROK_2         },
    { "pangu-embedded",    LLM_CHAT_TEMPLATE_PANGU_EMBED    },
+    { "solar-open",        LLM_CHAT_TEMPLATE_SOLAR_OPEN     },
 };
 
 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -216,6 +217,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_GROK_2;
     } else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) {
         return LLM_CHAT_TEMPLATE_PANGU_EMBED;
+    } else if (tmpl_contains("<|begin|>") && tmpl_contains("<|end|>") && tmpl_contains("<|content|>")) {
+        return LLM_CHAT_TEMPLATE_SOLAR_OPEN;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -845,6 +848,14 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "[unused9]助手:";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_SOLAR_OPEN) {
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|begin|>" << role << "<|content|>" << message->content << "<|end|>";
+        }
+        if (add_ass) {
+            ss << "<|begin|>assistant";
+        }
     } else {
         // template not supported
         return -1;
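For reference, the SOLAR_OPEN branch above renders a single user turn "Hello" with add_ass enabled as:

<|begin|>user<|content|>Hello<|end|><|begin|>assistant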