@fugood/llama.node 1.4.6 → 1.4.8
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- package/lib/binding.ts +8 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +25 -26
- package/src/LlamaContext.cpp +2 -2
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +364 -193
- package/src/llama.cpp/common/arg.h +43 -2
- package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +36 -18
- package/src/llama.cpp/common/chat-parser-xml-toolcall.h +1 -1
- package/src/llama.cpp/common/chat-parser.cpp +3 -2
- package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
- package/src/llama.cpp/common/chat.cpp +272 -0
- package/src/llama.cpp/common/common.cpp +130 -67
- package/src/llama.cpp/common/common.h +40 -16
- package/src/llama.cpp/common/console.cpp +680 -47
- package/src/llama.cpp/common/console.h +30 -8
- package/src/llama.cpp/common/download.cpp +69 -25
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
- package/src/llama.cpp/common/log.cpp +5 -0
- package/src/llama.cpp/common/log.h +1 -0
- package/src/llama.cpp/common/peg-parser.cpp +1 -1
- package/src/llama.cpp/common/preset.cpp +206 -0
- package/src/llama.cpp/common/preset.h +32 -0
- package/src/llama.cpp/common/sampling.cpp +91 -92
- package/src/llama.cpp/common/sampling.h +11 -6
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +7 -8
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +69 -39
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
- package/src/llama.cpp/include/llama.h +18 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
- package/src/llama.cpp/src/llama-arch.h +9 -2
- package/src/llama.cpp/src/llama-batch.cpp +12 -2
- package/src/llama.cpp/src/llama-batch.h +4 -2
- package/src/llama.cpp/src/llama-context.cpp +99 -29
- package/src/llama.cpp/src/llama-context.h +9 -3
- package/src/llama.cpp/src/llama-grammar.cpp +233 -33
- package/src/llama.cpp/src/llama-grammar.h +20 -1
- package/src/llama.cpp/src/llama-graph.cpp +85 -17
- package/src/llama.cpp/src/llama-graph.h +17 -4
- package/src/llama.cpp/src/llama-hparams.cpp +6 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -1
- package/src/llama.cpp/src/llama-impl.cpp +4 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
- package/src/llama.cpp/src/llama-kv-cache.h +19 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +123 -52
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
- package/src/llama.cpp/src/llama.cpp +675 -1
- package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
- package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp} +30 -5
- package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
- package/src/llama.cpp/src/models/glm4.cpp +27 -4
- package/src/llama.cpp/src/models/models.h +8 -7
- package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
- package/src/llama.cpp/src/models/qwen2.cpp +12 -3
- package/src/llama.cpp/src/models/qwen3next.cpp +81 -266

--- a/package/src/llama.cpp/common/common.cpp
+++ b/package/src/llama.cpp/common/common.cpp
@@ -1013,31 +1013,40 @@ bool tty_can_use_colors() {
 // Model utils
 //
 
-static inline void common_init_sampler_from_model(
+// TODO: move to common/sampling
+static void common_init_sampler_from_model(
         const llama_model * model,
         common_params_sampling & sparams) {
 
     const uint64_t config = sparams.user_sampling_config;
 
     auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
-        if (config & user_config) return;
+        if (config & user_config) {
+            return;
+        }
 
         char buf[64] = {0};
         if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
             char * end = nullptr;
             int32_t v = strtol(buf, &end, 10);
-            if (end && end != buf) dst = v;
+            if (end && end != buf) {
+                dst = v;
+            }
         }
     };
 
     auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
-        if (config & user_config) return;
+        if (config & user_config) {
+            return;
+        }
 
         char buf[128] = {0};
         if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
             char * end = nullptr;
             float v = strtof(buf, &end);
-            if (end && end != buf) dst = v;
+            if (end && end != buf) {
+                dst = v;
+            }
         }
     };
 
@@ -1065,31 +1074,125 @@ static inline void common_init_sampler_from_model(
     get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
 }
 
-struct common_init_result common_init_from_params(common_params & params) {
-    common_init_result iparams;
+struct common_init_result::impl {
+    impl() = default;
+    ~impl() = default;
+
+    llama_model_ptr model;
+    llama_context_ptr context;
+
+    std::vector<llama_adapter_lora_ptr> lora;
+
+    std::vector<common_sampler_ptr> samplers;
+};
+
+common_init_result::common_init_result(common_params & params) :
+    pimpl(new impl{}) {
     auto mparams = common_model_params_to_llama(params);
+    auto cparams = common_context_params_to_llama(params);
+
+    if (params.fit_params) {
+        LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
+        llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
+            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
+            params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
+    }
 
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n",
-                __func__, params.model.path.c_str());
-        return iparams;
+        return;
     }
 
-    common_init_sampler_from_model(model, params.sampling);
+    pimpl->model.reset(model);
 
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
-    auto cparams = common_context_params_to_llama(params);
+    // updates params.sampling
+    // TODO: fix naming
+    common_init_sampler_from_model(model, params.sampling);
+
+    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
+        params.sampling.ignore_eos = false;
+    }
+
+    // initialize once
+    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+        if (llama_vocab_is_eog(vocab, i)) {
+            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
+            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
+        }
+    }
+
+    if (params.sampling.ignore_eos) {
+        // add EOG biases to the active set of logit biases
+        params.sampling.logit_bias.insert(
+                params.sampling.logit_bias.end(),
+                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
+    }
+
+    //if (params.sampling.penalty_last_n == -1) {
+    //    LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    params.sampling.penalty_last_n = llama_n_ctx(lctx);
+    //}
+
+    //if (params.sampling.dry_penalty_last_n == -1) {
+    //    LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
+    //}
+
+    pimpl->samplers.resize(cparams.n_seq_max);
+
+    for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
+        pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
+    }
 
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n",
-                __func__, params.model.path.c_str());
-        llama_model_free(model);
-        return iparams;
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
+        return;
+    }
+
+    pimpl->context.reset(lctx);
+}
+
+llama_model * common_init_result::model() {
+    return pimpl->model.get();
+}
+
+llama_context * common_init_result::context() {
+    return pimpl->context.get();
+}
+
+common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
+    return pimpl->samplers[seq_id].get();
+}
+
+std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
+    return pimpl->lora;
+}
+
+void common_init_result::free_context() {
+    pimpl->context.reset();
+}
+
+common_init_result_ptr common_init_from_params(common_params & params) {
+    common_init_result_ptr res(new common_init_result(params));
+
+    llama_model * model = res->model();
+    if (model == NULL) {
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
+        return res;
+    }
+
+    llama_context * lctx = res->context();
+    if (lctx == NULL) {
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
+        return res;
     }
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
         LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
@@ -1101,10 +1204,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
         const auto cvec = common_control_vector_load(params.control_vectors);
         if (cvec.n_embd == -1) {
-            llama_free(lctx);
-            llama_model_free(model);
-
-            return iparams;
+            return res;
         }
 
         int err = llama_apply_adapter_cvec(
@@ -1115,10 +1215,7 @@ struct common_init_result common_init_from_params(common_params & params) {
                 params.control_vector_layer_start,
                 params.control_vector_layer_end);
         if (err) {
-            llama_free(lctx);
-            llama_model_free(model);
-
-            return iparams;
+            return res;
         }
     }
 
@@ -1142,10 +1239,7 @@ struct common_init_result common_init_from_params(common_params & params) {
        }
 
        if (!ok) {
-            llama_free(lctx);
-            llama_model_free(model);
-
-            return iparams;
+            return res;
        }
     }
 
@@ -1155,9 +1249,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
         if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
-            llama_free(lctx);
-            llama_model_free(model);
-            return iparams;
+            return res;
         }
 
         char buf[1024];
@@ -1166,43 +1258,13 @@ struct common_init_result common_init_from_params(common_params & params) {
         la.task_name = buf;
         llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
         la.prompt_prefix = buf;
-        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
+        res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
 
     if (!params.lora_init_without_apply) {
         common_set_adapter_lora(lctx, params.lora_adapters);
     }
 
-    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
-        params.sampling.ignore_eos = false;
-    }
-
-    // initialize once
-    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
-        if (llama_vocab_is_eog(vocab, i)) {
-            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
-            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
-        }
-    }
-
-    if (params.sampling.ignore_eos) {
-        // add EOG biases to the active set of logit biases
-        params.sampling.logit_bias.insert(
-            params.sampling.logit_bias.end(),
-            params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
-    }
-
-    if (params.sampling.penalty_last_n == -1) {
-        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
-        params.sampling.penalty_last_n = llama_n_ctx(lctx);
-    }
-
-    if (params.sampling.dry_penalty_last_n == -1) {
-        LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
-        params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
-    }
-
     if (params.warmup) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
@@ -1241,12 +1303,11 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_set_warmup(lctx, false);
     }
 
-    iparams.model.reset(model);
-    iparams.context.reset(lctx);
-
-    return iparams;
+    return res;
 }
 
+common_init_result::~common_init_result() = default;
+
 std::string get_model_endpoint() {
     const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
     // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
@@ -1255,7 +1316,9 @@ std::string get_model_endpoint() {
     std::string model_endpoint = "https://huggingface.co/";
     if (endpoint_env) {
         model_endpoint = endpoint_env;
-        if (model_endpoint.back() != '/') model_endpoint += '/';
+        if (model_endpoint.back() != '/') {
+            model_endpoint += '/';
+        }
     }
     return model_endpoint;
 }
--- a/package/src/llama.cpp/common/common.h
+++ b/package/src/llama.cpp/common/common.h
@@ -82,7 +82,8 @@ int32_t cpu_get_num_math();
 enum llama_example {
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
-    LLAMA_EXAMPLE_MAIN,
+    LLAMA_EXAMPLE_COMPLETION,
+    LLAMA_EXAMPLE_CLI,
     LLAMA_EXAMPLE_EMBEDDING,
     LLAMA_EXAMPLE_PERPLEXITY,
     LLAMA_EXAMPLE_RETRIEVAL,
@@ -98,6 +99,7 @@ enum llama_example {
     LLAMA_EXAMPLE_TTS,
     LLAMA_EXAMPLE_DIFFUSION,
     LLAMA_EXAMPLE_FINETUNE,
+    LLAMA_EXAMPLE_FIT_PARAMS,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -194,7 +196,6 @@ struct common_params_sampling {
 
     std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
 
-
     std::vector<enum common_sampler_type> samplers = {
         COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
@@ -215,6 +216,10 @@ struct common_params_sampling {
     std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
     std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
 
+    bool has_logit_bias() const {
+        return !logit_bias.empty();
+    }
+
     // print the parameters into a string
     std::string print() const;
 };
@@ -303,8 +308,8 @@ struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
 
 struct common_params {
     bool vocab_only = false;
-    int32_t n_predict = -1; // new tokens to predict
-    int32_t n_ctx =
+    int32_t n_predict = -1; // max. number of new tokens to predict, -1 == no limit
+    int32_t n_ctx = 0; // context size, 0 == context the model was trained with
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -325,9 +330,12 @@ struct common_params {
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
-    int32_t n_gpu_layers
-    int32_t main_gpu
-    float tensor_split[128]
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    bool fit_params = true; // whether to fit unset model/context parameters to free device memory
+    size_t fit_params_target = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
+    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
 
     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
@@ -407,6 +415,7 @@ struct common_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool no_perf = false; // disable performance metrics
+    bool show_timings = true; // show timing information on CLI
     bool ctx_shift = false; // context shift on infinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified = false; // enable unified KV cache
@@ -463,7 +472,7 @@ struct common_params {
     std::string public_path = ""; // NOLINT
     std::string api_prefix = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
-    bool use_jinja =
+    bool use_jinja = true; // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
@@ -483,9 +492,10 @@ struct common_params {
     bool endpoint_metrics = false;
 
     // router server configs
-    std::string models_dir
-
-
+    std::string models_dir = ""; // directory containing models for the router server
+    std::string models_preset = ""; // directory containing model presets for the router server
+    int models_max = 4; // maximum number of models to load simultaneously
+    bool models_autoload = true; // automatically load models when requested via the router server
 
     bool log_json = false;
 
@@ -667,15 +677,29 @@ bool tty_can_use_colors();
 // Model utils
 //
 
-// note: defines object's lifetime
+struct common_sampler;
+
+// note: defines the model, context, samplers, ets. lifetimes
 struct common_init_result {
-    llama_model_ptr   model;
-    llama_context_ptr context;
+    common_init_result(common_params & params);
+    ~common_init_result();
 
-    std::vector<llama_adapter_lora_ptr> lora;
+    llama_model * model();
+    llama_context * context();
+    common_sampler * sampler(llama_seq_id seq_id);
+
+    std::vector<llama_adapter_lora_ptr> & lora();
+
+    void free_context();
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
 };
 
-struct common_init_result common_init_from_params(common_params & params);
+using common_init_result_ptr = std::unique_ptr<common_init_result>;
+
+common_init_result_ptr common_init_from_params(common_params & params);
 
 struct llama_model_params common_model_params_to_llama ( common_params & params);
 struct llama_context_params common_context_params_to_llama(const common_params & params);