@fugood/llama.node 1.4.11 → 1.4.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +30 -30
- package/src/llama.cpp/common/arg.cpp +29 -14
- package/src/llama.cpp/common/arg.h +1 -0
- package/src/llama.cpp/common/chat-parser.cpp +11 -0
- package/src/llama.cpp/common/chat.cpp +32 -3
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +23 -23
- package/src/llama.cpp/common/common.h +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- package/src/llama.cpp/include/llama.h +13 -4
- package/src/llama.cpp/src/CMakeLists.txt +4 -0
- package/src/llama.cpp/src/llama-adapter.cpp +12 -3
- package/src/llama.cpp/src/llama-adapter.h +7 -1
- package/src/llama.cpp/src/llama-arch.cpp +76 -0
- package/src/llama.cpp/src/llama-arch.h +7 -0
- package/src/llama.cpp/src/llama-chat.cpp +11 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +22 -21
- package/src/llama.cpp/src/llama-hparams.h +4 -3
- package/src/llama.cpp/src/llama-kv-cache.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +287 -16
- package/src/llama.cpp/src/llama-model.h +13 -2
- package/src/llama.cpp/src/llama-sampling.cpp +44 -33
- package/src/llama.cpp/src/llama-sampling.h +3 -0
- package/src/llama.cpp/src/llama-vocab.cpp +101 -33
- package/src/llama.cpp/src/llama-vocab.h +2 -0
- package/src/llama.cpp/src/llama.cpp +52 -37
- package/src/llama.cpp/src/models/bert.cpp +4 -2
- package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
- package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
- package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- package/src/llama.cpp/src/models/gemma3.cpp +3 -4
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- package/src/llama.cpp/src/models/llama.cpp +19 -6
- package/src/llama.cpp/src/models/maincoder.cpp +117 -0
- package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- package/src/llama.cpp/src/models/models.h +18 -0
- package/src/llama.cpp/src/models/modern-bert.cpp +115 -0
- package/src/llama.cpp/src/models/plamo3.cpp +128 -0
- package/src/llama.cpp/src/unicode.cpp +23 -14
package/src/llama.cpp/include/llama.h

@@ -286,7 +286,7 @@ extern "C" {
         // NULL-terminated list of buffer types to use for tensors that match a pattern
         const struct llama_model_tensor_buft_override * tensor_buft_overrides;
 
-        int32_t n_gpu_layers; // number of layers to store in VRAM
+        int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
         // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
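
Note: given the clarified semantics above, a minimal usage sketch (the model path is a placeholder):

    // offload every layer without querying the layer count first
    struct llama_model_params mparams = llama_default_model_params();
    mparams.n_gpu_layers = -1; // a negative value now explicitly means "all layers"
    struct llama_model * model = llama_model_load_from_file("model.gguf", mparams);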

@@ -467,10 +467,17 @@ extern "C" {
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
+    enum llama_params_fit_status {
+        LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
+        LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
+        LLAMA_PARAMS_FIT_STATUS_ERROR   = 2, // a hard error occurred, e.g. because no model could be found at the specified path
+    };
+
     // fits mparams and cparams to free device memory (assumes system memory is unlimited)
-    // returns true if the parameters could be successfully modified to fit device memory
-    // this function is NOT thread safe because it modifies the global llama logger state
-    LLAMA_API bool llama_params_fit(
+    // - returns true if the parameters could be successfully modified to fit device memory
+    // - this function is NOT thread safe because it modifies the global llama logger state
+    // - only parameters that have the same value as in llama_default_model_params are modified
+    LLAMA_API enum llama_params_fit_status llama_params_fit(
             const char * path_model,
             struct llama_model_params * mparams,
             struct llama_context_params * cparams,
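
Note: the trailing parameters of llama_params_fit are truncated in this diff, so a usage sketch can only abbreviate the call; the status handling below follows the enum added above:

    // hypothetical call site; "/* , ... */" stands in for the trailing arguments not shown in this diff
    enum llama_params_fit_status status = llama_params_fit(path_model, &mparams, &cparams /* , ... */);
    switch (status) {
        case LLAMA_PARAMS_FIT_STATUS_SUCCESS: break;                        // params adjusted, proceed to load
        case LLAMA_PARAMS_FIT_STATUS_FAILURE: /* keep defaults or shrink */ break;
        case LLAMA_PARAMS_FIT_STATUS_ERROR:   /* e.g. bad model path */     break;
    }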

@@ -600,6 +607,8 @@ extern "C" {
     //
 
     // Load a LoRA adapter from file
+    // The adapter is valid as long as the associated model is not freed
+    // All adapters must be loaded before context creation
    LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
             struct llama_model * model,
             const char * path_lora);
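
Note: a minimal sketch of the documented ordering constraint (adapter before context; the adapter path is a placeholder and cparams is assumed to be set up elsewhere):

    struct llama_adapter_lora * adapter = llama_adapter_lora_init(model, "adapter.gguf");
    struct llama_context * ctx = llama_init_from_model(model, cparams); // create the context only after all adapters are loaded
    llama_set_adapter_lora(ctx, adapter, 1.0f);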

package/src/llama.cpp/src/CMakeLists.txt

@@ -87,9 +87,12 @@ add_library(llama
             models/llada.cpp
             models/llama-iswa.cpp
             models/llama.cpp
+            models/maincoder.cpp
             models/mamba.cpp
+            models/mimo2-iswa.cpp
             models/minicpm3.cpp
             models/minimax-m2.cpp
+            models/modern-bert.cpp
             models/mpt.cpp
             models/nemotron-h.cpp
             models/nemotron.cpp

@@ -105,6 +108,7 @@ add_library(llama
             models/phi3.cpp
             models/plamo.cpp
             models/plamo2.cpp
+            models/plamo3.cpp
             models/plm.cpp
             models/qwen.cpp
             models/qwen2.cpp

package/src/llama.cpp/src/llama-adapter.cpp

@@ -146,9 +146,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
     return nullptr;
 }
 
-static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
+static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
+    llama_model & model = adapter.model;
+
     ggml_context * ctx_init;
     gguf_init_params meta_gguf_params = {
         /* .no_alloc = */ true,

@@ -411,14 +413,17 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
         }
     }
 
+    // update number of nodes used
+    model.n_lora_nodes += adapter.get_n_nodes();
+
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
 
 llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
-    llama_adapter_lora * adapter = new llama_adapter_lora();
+    llama_adapter_lora * adapter = new llama_adapter_lora(*model);
 
     try {
-        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
+        llama_adapter_lora_init_impl(path_lora, *adapter);
         return adapter;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());

@@ -469,6 +474,10 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
 }
 
 void llama_adapter_lora_free(llama_adapter_lora * adapter) {
+    // update number of nodes used
+    GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
+    adapter->model.n_lora_nodes -= adapter->get_n_nodes();
+
     delete adapter;
 }
 
package/src/llama.cpp/src/llama-adapter.h

@@ -59,6 +59,8 @@ struct llama_adapter_lora_weight {
 };
 
 struct llama_adapter_lora {
+    llama_model & model;
+
     // map tensor name to lora_a_b
     std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
 
@@ -73,10 +75,14 @@ struct llama_adapter_lora {
     // activated lora (aLoRA)
     std::vector<llama_token> alora_invocation_tokens;
 
-    llama_adapter_lora() = default;
+    llama_adapter_lora(llama_model & model) : model(model) {}
     ~llama_adapter_lora() = default;
 
     llama_adapter_lora_weight * get_weight(ggml_tensor * w);
+
+    uint32_t get_n_nodes() const {
+        return ab_map.size() * 6u; // a, b, scale, add, 2 x mul_mat
+    }
 };
 
 using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
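
Note: as a worked example of the node accounting above (numbers are illustrative, not from the diff): a LoRA file whose ab_map holds 32 A/B pairs reports 32 * 6 = 192 extra graph nodes, which llama-context.cpp folds into its graph size estimate via model.n_lora_nodes (see the graph_max_nodes hunk further below).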

package/src/llama.cpp/src/llama-arch.cpp

@@ -20,6 +20,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_STARCODER, "starcoder" },
     { LLM_ARCH_REFACT, "refact" },
     { LLM_ARCH_BERT, "bert" },
+    { LLM_ARCH_MODERN_BERT, "modern-bert" },
     { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
     { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
     { LLM_ARCH_NEO_BERT, "neo-bert" },

@@ -41,6 +42,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_PHIMOE, "phimoe" },
     { LLM_ARCH_PLAMO, "plamo" },
     { LLM_ARCH_PLAMO2, "plamo2" },
+    { LLM_ARCH_PLAMO3, "plamo3" },
     { LLM_ARCH_CODESHELL, "codeshell" },
     { LLM_ARCH_ORION, "orion" },
     { LLM_ARCH_INTERNLM2, "internlm2" },

@@ -114,6 +116,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_RND1, "rnd1" },
     { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
     { LLM_ARCH_MISTRAL3, "mistral3" },
+    { LLM_ARCH_MIMO2, "mimo2" },
+    { LLM_ARCH_LLAMA_EMBED, "llama-embed" },
+    { LLM_ARCH_MAINCODER, "maincoder" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -204,6 +209,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+    { LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, "%s.attention.sliding_window_pattern" },
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
     { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
     { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },

@@ -214,6 +220,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
+    { LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
     { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
     { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
     { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },

@@ -497,6 +504,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_DECI:
         case LLM_ARCH_MISTRAL3:
+        case LLM_ARCH_LLAMA_EMBED:
             return {
                 LLM_TENSOR_TOKEN_EMBD,
                 LLM_TENSOR_OUTPUT_NORM,

@@ -778,6 +786,20 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                 LLM_TENSOR_CLS,
                 LLM_TENSOR_CLS_OUT,
             };
+        case LLM_ARCH_MODERN_BERT:
+            return {
+                LLM_TENSOR_TOKEN_EMBD,
+                LLM_TENSOR_TOKEN_EMBD_NORM,
+                LLM_TENSOR_OUTPUT_NORM,
+                LLM_TENSOR_ATTN_NORM,
+                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_ATTN_QKV,
+                LLM_TENSOR_FFN_DOWN,
+                LLM_TENSOR_FFN_UP,
+                LLM_TENSOR_FFN_NORM,
+                LLM_TENSOR_CLS,
+                LLM_TENSOR_CLS_OUT,
+            };
         case LLM_ARCH_JINA_BERT_V2:
             return {
                 LLM_TENSOR_TOKEN_EMBD,

@@ -1057,6 +1079,22 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                 LLM_TENSOR_ATTN_POST_NORM,
                 LLM_TENSOR_FFN_POST_NORM,
             };
+        case LLM_ARCH_PLAMO3:
+            return {
+                LLM_TENSOR_TOKEN_EMBD,
+                LLM_TENSOR_OUTPUT_NORM,
+                LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_ATTN_NORM,
+                LLM_TENSOR_ATTN_QKV,
+                LLM_TENSOR_ATTN_Q_NORM,
+                LLM_TENSOR_ATTN_K_NORM,
+                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_ATTN_POST_NORM,
+                LLM_TENSOR_FFN_NORM,
+                LLM_TENSOR_FFN_POST_NORM,
+                LLM_TENSOR_FFN_DOWN,
+                LLM_TENSOR_FFN_UP,
+            };
         case LLM_ARCH_CODESHELL:
             return {
                 LLM_TENSOR_TOKEN_EMBD,

@@ -2171,11 +2209,49 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                 LLM_TENSOR_VISEXP_FFN_DOWN,
                 LLM_TENSOR_VISEXP_FFN_UP,
             };
+        case LLM_ARCH_MIMO2:
+            return {
+                LLM_TENSOR_TOKEN_EMBD,
+                LLM_TENSOR_OUTPUT_NORM,
+                LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_ATTN_NORM,
+                LLM_TENSOR_ATTN_Q,
+                LLM_TENSOR_ATTN_K,
+                LLM_TENSOR_ATTN_V,
+                LLM_TENSOR_ATTN_SINKS,
+                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_FFN_NORM,
+                LLM_TENSOR_FFN_GATE,
+                LLM_TENSOR_FFN_DOWN,
+                LLM_TENSOR_FFN_UP,
+                LLM_TENSOR_FFN_GATE_INP,
+                LLM_TENSOR_FFN_GATE_EXPS,
+                LLM_TENSOR_FFN_DOWN_EXPS,
+                LLM_TENSOR_FFN_UP_EXPS,
+                LLM_TENSOR_FFN_EXP_PROBS_B,
+            };
         case LLM_ARCH_GPTJ:
         case LLM_ARCH_UNKNOWN:
             return {
                 LLM_TENSOR_TOKEN_EMBD,
             };
+        case LLM_ARCH_MAINCODER:
+            return {
+                LLM_TENSOR_TOKEN_EMBD,
+                LLM_TENSOR_OUTPUT_NORM,
+                LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_ATTN_NORM,
+                LLM_TENSOR_ATTN_Q,
+                LLM_TENSOR_ATTN_Q_NORM,
+                LLM_TENSOR_ATTN_K,
+                LLM_TENSOR_ATTN_K_NORM,
+                LLM_TENSOR_ATTN_V,
+                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_FFN_NORM,
+                LLM_TENSOR_FFN_GATE,
+                LLM_TENSOR_FFN_DOWN,
+                LLM_TENSOR_FFN_UP,
+            };
         default:
             GGML_ABORT("unknown architecture for tensor mapping");
     }

package/src/llama.cpp/src/llama-arch.h

@@ -24,6 +24,7 @@ enum llm_arch {
     LLM_ARCH_STARCODER,
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
+    LLM_ARCH_MODERN_BERT,
     LLM_ARCH_NOMIC_BERT,
     LLM_ARCH_NOMIC_BERT_MOE,
     LLM_ARCH_NEO_BERT,

@@ -45,6 +46,7 @@ enum llm_arch {
     LLM_ARCH_PHIMOE,
     LLM_ARCH_PLAMO,
     LLM_ARCH_PLAMO2,
+    LLM_ARCH_PLAMO3,
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,
     LLM_ARCH_INTERNLM2,

@@ -118,6 +120,9 @@ enum llm_arch {
     LLM_ARCH_RND1,
     LLM_ARCH_PANGU_EMBED,
     LLM_ARCH_MISTRAL3,
+    LLM_ARCH_MIMO2,
+    LLM_ARCH_LLAMA_EMBED,
+    LLM_ARCH_MAINCODER,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -208,6 +213,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_GATE_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_OUTPUT_SCALE,
     LLM_KV_ATTENTION_TEMPERATURE_LENGTH,

@@ -218,6 +224,7 @@ enum llm_kv {
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
     LLM_KV_ROPE_FREQ_BASE,
+    LLM_KV_ROPE_FREQ_BASE_SWA,
     LLM_KV_ROPE_SCALE_LINEAR,
     LLM_KV_ROPE_SCALING_TYPE,
     LLM_KV_ROPE_SCALING_FACTOR,

package/src/llama.cpp/src/llama-chat.cpp

@@ -74,6 +74,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
     { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
     { "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
+    { "solar-open", LLM_CHAT_TEMPLATE_SOLAR_OPEN },
 };
 
 llm_chat_template llm_chat_template_from_str(const std::string & name) {

@@ -216,6 +217,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_GROK_2;
     } else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) {
         return LLM_CHAT_TEMPLATE_PANGU_EMBED;
+    } else if (tmpl_contains("<|begin|>") && tmpl_contains("<|end|>") && tmpl_contains("<|content|>")) {
+        return LLM_CHAT_TEMPLATE_SOLAR_OPEN;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }

@@ -845,6 +848,14 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "[unused9]助手:";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_SOLAR_OPEN) {
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|begin|>" << role << "<|content|>" << message->content << "<|end|>";
+        }
+        if (add_ass) {
+            ss << "<|begin|>assistant";
+        }
     } else {
         // template not supported
         return -1;
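
Note: for reference, the SOLAR_OPEN branch above renders a two-message chat like this (illustrative input, with add_ass == true):

    <|begin|>user<|content|>Hello<|end|><|begin|>assistant<|content|>Hi there!<|end|><|begin|>assistant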

package/src/llama.cpp/src/llama-context.cpp

@@ -294,8 +294,8 @@ llama_context::llama_context(
     // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
     bool pipeline_parallel =
         model.n_devices() > 1 &&
-        model.params.n_gpu_layers > (int) model.hparams.n_layer &&
-        model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
+        model.n_gpu_layers() > model.hparams.n_layer &&
+        model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
         cparams.offload_kqv &&
         !model.has_tensor_overrides();
 
@@ -459,23 +459,22 @@ llama_context::llama_context(
 }
 
 llama_context::~llama_context() {
-    … (16 removed lines not preserved in this diff view)
-    // }
+    if (!model.hparams.no_alloc) {
+        for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+            ggml_backend_t backend = backend_ptrs[i];
+            ggml_backend_buffer_type_t buft = backend_buft[i];
+
+            const size_t size_exp = backend_buf_exp_size[i];
+            const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+            if (size_exp == size_act) {
+                LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
+                        __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+            } else {
+                LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
+                        __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+            }
+        }
+    }
     ggml_opt_free(opt_ctx);
 }
 
@@ -1443,7 +1442,9 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
     if (model.arch == LLM_ARCH_QWEN3NEXT) {
         return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
     }
-    return std::max<uint32_t>(1024u, 8u*model.n_tensors());
+    uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
+    res += model.n_lora_nodes;
+    return res;
 }
 
 llm_graph_result * llama_context::get_gf_res_reserve() const {
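
Note: a quick sanity check of the formula above (illustrative numbers): for a model with 1500 tensors and one loaded adapter holding 128 A/B pairs, graph_max_nodes returns max(1024, 8*1500) + 128*6 = 12000 + 768 = 12768.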

@@ -1571,7 +1572,7 @@ llm_graph_cb llama_context::graph_get_cb() const {
 
         // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
         // FIXME: fix in ggml_backend_sched
-        const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer;
+        const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;
         if (ubatch.n_tokens < 32 || full_offload) {
             if (il != -1 && strcmp(name, "norm") == 0) {
                 const auto & dev_layer = model.dev_layer(il);

package/src/llama.cpp/src/llama-hparams.h

@@ -123,10 +123,11 @@ struct llama_hparams {
     llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
     // the size of the sliding window (0 - no SWA)
     uint32_t n_swa = 0;
-    // if swa_layers[il] == true, then layer il is SWA
-    // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
+    // if swa_layers[il] == 1, then layer il is SWA
+    // if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA)
     // by default, all layers are dense
-    std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
+    // note: using uint32_t type for compatibility reason
+    std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;
 
     // for State Space Models
     uint32_t ssm_d_conv = 0;
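
Note: a minimal sketch of how this array might be filled for an "every 4th layer is dense" pattern (the loop is illustrative; the real per-model setup lives in the model loading code):

    for (uint32_t il = 0; il < hparams.n_layer; ++il) {
        hparams.swa_layers[il] = ((il + 1) % 4 != 0) ? 1 : 0; // 1 = SWA, 0 = dense
    }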

package/src/llama.cpp/src/llama-kv-cache.h

@@ -305,7 +305,7 @@ public:
             bool do_shift,
             stream_copy_info sc_info);
 
-    // used to create a batch …
+    // used to create a batch processing context from a batch
     llama_kv_cache_context(
             llama_kv_cache * kv,
             slot_info_vec_t sinfos,

package/src/llama.cpp/src/llama-mmap.cpp

@@ -240,9 +240,10 @@ struct llama_file::impl {
             throw std::runtime_error("unexpectedly reached end of file");
         }
     } else {
-        …
-        while (…
-        …
+        size_t bytes_read = 0;
+        while (bytes_read < len) {
+            const size_t to_read = len - bytes_read;
+            ssize_t ret = ::read(fd, reinterpret_cast<char *>(ptr) + bytes_read, to_read);
 
             if (ret == -1) {
                 if (errno == EINTR) {

@@ -251,10 +252,16 @@ struct llama_file::impl {
                 throw std::runtime_error(format("read error: %s", strerror(errno)));
             }
             if (ret == 0) {
+                // EOF: allow if this read was only pulling alignment padding past file end
+                off_t pos = lseek(fd, 0, SEEK_CUR);
+                if (pos != -1 && (size_t) pos == size) {
+                    std::memset(reinterpret_cast<char *>(ptr) + bytes_read, 0, len - bytes_read);
+                    return;
+                }
                 throw std::runtime_error("unexpectedly reached end of file");
             }
 
-            …
+            bytes_read += (size_t) ret;
         }
     }
 }

package/src/llama.cpp/src/llama-model-loader.cpp

@@ -462,6 +462,29 @@ namespace GGUFMeta {
     return get_key_or_arr(llm_kv(kid), result, n, required);
 }
 
+bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
+    const std::string key = llm_kv(kid);
+
+    const int id = gguf_find_key(meta.get(), key.c_str());
+
+    if (id < 0) {
+        if (required) {
+            throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+        }
+        return false;
+    }
+
+    // throw an error if the type is an array
+    if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
+        if (required) {
+            throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
+        }
+        return false;
+    }
+
+    return get_key(key, result, required);
+}
+
 // TODO: this is not very clever - figure out something better
 template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
 template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);

package/src/llama.cpp/src/llama-model-loader.h

@@ -131,6 +131,8 @@ struct llama_model_loader {
     template<typename T>
     bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);
 
+    bool get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required = true);
+
     std::string get_arch_name() const;
 
     enum llm_arch get_arch() const;
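
Note: a plausible call site for the new scalar overload, e.g. reading the sliding-window pattern key added in llama-arch.cpp above (the variable name is illustrative):

    uint32_t n_swa_pattern = 1;
    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, n_swa_pattern, /*required =*/ false); // optional key: default kept if absent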