@fugood/llama.node 1.4.6 → 1.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +25 -26
  4. package/src/LlamaContext.cpp +2 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +364 -193
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +36 -18
  9. package/src/llama.cpp/common/chat-parser-xml-toolcall.h +1 -1
  10. package/src/llama.cpp/common/chat-parser.cpp +3 -2
  11. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  12. package/src/llama.cpp/common/chat.cpp +272 -0
  13. package/src/llama.cpp/common/common.cpp +130 -67
  14. package/src/llama.cpp/common/common.h +40 -16
  15. package/src/llama.cpp/common/console.cpp +680 -47
  16. package/src/llama.cpp/common/console.h +30 -8
  17. package/src/llama.cpp/common/download.cpp +69 -25
  18. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  19. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  20. package/src/llama.cpp/common/log.cpp +5 -0
  21. package/src/llama.cpp/common/log.h +1 -0
  22. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  23. package/src/llama.cpp/common/preset.cpp +206 -0
  24. package/src/llama.cpp/common/preset.h +32 -0
  25. package/src/llama.cpp/common/sampling.cpp +91 -92
  26. package/src/llama.cpp/common/sampling.h +11 -6
  27. package/src/llama.cpp/common/speculative.cpp +1 -1
  28. package/src/llama.cpp/ggml/CMakeLists.txt +5 -0
  29. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  30. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  31. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  32. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  33. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +3 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +69 -39
  37. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
  39. package/src/llama.cpp/include/llama.h +18 -1
  40. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  41. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  42. package/src/llama.cpp/src/llama-arch.h +9 -2
  43. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  44. package/src/llama.cpp/src/llama-batch.h +4 -2
  45. package/src/llama.cpp/src/llama-context.cpp +99 -29
  46. package/src/llama.cpp/src/llama-context.h +9 -3
  47. package/src/llama.cpp/src/llama-grammar.cpp +233 -33
  48. package/src/llama.cpp/src/llama-grammar.h +20 -1
  49. package/src/llama.cpp/src/llama-graph.cpp +85 -17
  50. package/src/llama.cpp/src/llama-graph.h +17 -4
  51. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  52. package/src/llama.cpp/src/llama-hparams.h +5 -1
  53. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  54. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  55. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  56. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  57. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  58. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  59. package/src/llama.cpp/src/llama-model.cpp +123 -52
  60. package/src/llama.cpp/src/llama-model.h +1 -0
  61. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  62. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  63. package/src/llama.cpp/src/llama.cpp +675 -1
  64. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  65. package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp} +30 -5
  66. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  67. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  68. package/src/llama.cpp/src/models/models.h +8 -7
  69. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  70. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  71. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
package/src/llama.cpp/common/common.cpp

@@ -1013,31 +1013,40 @@ bool tty_can_use_colors() {
 // Model utils
 //
 
-static inline void common_init_sampler_from_model(
+// TODO: move to common/sampling
+static void common_init_sampler_from_model(
         const llama_model * model,
         common_params_sampling & sparams) {
 
     const uint64_t config = sparams.user_sampling_config;
 
     auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
-        if (config & user_config) return;
+        if (config & user_config) {
+            return;
+        }
 
         char buf[64] = {0};
         if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
            char * end = nullptr;
            int32_t v = strtol(buf, &end, 10);
-            if (end && end != buf) dst = v;
+            if (end && end != buf) {
+                dst = v;
+            }
        }
    };
 
    auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
-        if (config & user_config) return;
+        if (config & user_config) {
+            return;
+        }
 
        char buf[128] = {0};
        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
            char * end = nullptr;
            float v = strtof(buf, &end);
-            if (end && end != buf) dst = v;
+            if (end && end != buf) {
+                dst = v;
+            }
        }
    };
 
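The hunk above is mostly a brace-style cleanup, but it also shows the pattern the helpers follow: a sampling default stored in the model's GGUF metadata is applied only when the user has not already set that parameter, as tracked by the user_sampling_config bitmask. A minimal standalone sketch of the same idea (the helper name and the flag argument are illustrative, not part of the package):

#include <cstdint>
#include <cstdlib>

#include "llama.h"

// Illustrative helper: read an int32 default from model metadata unless the
// user already set the value explicitly (its bit is present in 'user_bits').
static void meta_int32_or_keep(const llama_model * model, const char * key,
                               uint64_t user_bits, uint64_t flag, int32_t & dst) {
    if (user_bits & flag) {
        return; // an explicit user setting wins over model metadata
    }
    char buf[64] = {0};
    if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
        char * end = nullptr;
        const int32_t v = (int32_t) strtol(buf, &end, 10);
        if (end && end != buf) {
            dst = v; // accept only if the metadata string parsed as a number
        }
    }
}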
@@ -1065,31 +1074,125 @@ static inline void common_init_sampler_from_model(
     get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
 }
 
-struct common_init_result common_init_from_params(common_params & params) {
-    common_init_result iparams;
+struct common_init_result::impl {
+    impl() = default;
+    ~impl() = default;
+
+    llama_model_ptr model;
+    llama_context_ptr context;
+
+    std::vector<llama_adapter_lora_ptr> lora;
+
+    std::vector<common_sampler_ptr> samplers;
+};
+
+common_init_result::common_init_result(common_params & params) :
+    pimpl(new impl{}) {
     auto mparams = common_model_params_to_llama(params);
+    auto cparams = common_context_params_to_llama(params);
+
+    if (params.fit_params) {
+        LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
+        llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
+            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
+            params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
+    }
 
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
-                __func__, params.model.path.c_str());
-        return iparams;
+        return;
     }
 
-    common_init_sampler_from_model(model, params.sampling);
+    pimpl->model.reset(model);
 
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
-    auto cparams = common_context_params_to_llama(params);
+    // updates params.sampling
+    // TODO: fix naming
+    common_init_sampler_from_model(model, params.sampling);
+
+    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
+        params.sampling.ignore_eos = false;
+    }
+
+    // initialize once
+    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+        if (llama_vocab_is_eog(vocab, i)) {
+            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
+            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
+        }
+    }
+
+    if (params.sampling.ignore_eos) {
+        // add EOG biases to the active set of logit biases
+        params.sampling.logit_bias.insert(
+                params.sampling.logit_bias.end(),
+                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
+    }
+
+    //if (params.sampling.penalty_last_n == -1) {
+    //    LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    params.sampling.penalty_last_n = llama_n_ctx(lctx);
+    //}
+
+    //if (params.sampling.dry_penalty_last_n == -1) {
+    //    LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
+    //}
+
+    pimpl->samplers.resize(cparams.n_seq_max);
+
+    for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
+        pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
+    }
 
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
-                __func__, params.model.path.c_str());
-        llama_model_free(model);
-        return iparams;
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
+        return;
+    }
+
+    pimpl->context.reset(lctx);
+}
+
+llama_model * common_init_result::model() {
+    return pimpl->model.get();
+}
+
+llama_context * common_init_result::context() {
+    return pimpl->context.get();
+}
+
+common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
+    return pimpl->samplers[seq_id].get();
+}
+
+std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
+    return pimpl->lora;
+}
+
+void common_init_result::free_context() {
+    pimpl->context.reset();
+}
+
+common_init_result_ptr common_init_from_params(common_params & params) {
+    common_init_result_ptr res(new common_init_result(params));
+
+    llama_model * model = res->model();
+    if (model == NULL) {
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
+        return res;
+    }
+
+    llama_context * lctx = res->context();
+    if (lctx == NULL) {
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
+        return res;
     }
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
         LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
@@ -1101,10 +1204,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
         const auto cvec = common_control_vector_load(params.control_vectors);
         if (cvec.n_embd == -1) {
-            llama_free(lctx);
-            llama_model_free(model);
-
-            return iparams;
+            return res;
         }
 
         int err = llama_apply_adapter_cvec(
@@ -1115,10 +1215,7 @@ struct common_init_result common_init_from_params(common_params & params) {
                 params.control_vector_layer_start,
                 params.control_vector_layer_end);
         if (err) {
-            llama_free(lctx);
-            llama_model_free(model);
-
-            return iparams;
+            return res;
         }
     }
 
@@ -1142,10 +1239,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         }
 
         if (!ok) {
-            llama_free(lctx);
-            llama_model_free(model);
-
-            return iparams;
+            return res;
         }
     }
 
@@ -1155,9 +1249,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
         if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
-            llama_free(lctx);
-            llama_model_free(model);
-            return iparams;
+            return res;
         }
 
         char buf[1024];
@@ -1166,43 +1258,13 @@ struct common_init_result common_init_from_params(common_params & params) {
         la.task_name = buf;
         llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
         la.prompt_prefix = buf;
-        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
+        res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
 
     if (!params.lora_init_without_apply) {
         common_set_adapter_lora(lctx, params.lora_adapters);
     }
 
-    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
-        params.sampling.ignore_eos = false;
-    }
-
-    // initialize once
-    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
-        if (llama_vocab_is_eog(vocab, i)) {
-            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
-            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
-        }
-    }
-
-    if (params.sampling.ignore_eos) {
-        // add EOG biases to the active set of logit biases
-        params.sampling.logit_bias.insert(
-                params.sampling.logit_bias.end(),
-                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
-    }
-
-    if (params.sampling.penalty_last_n == -1) {
-        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
-        params.sampling.penalty_last_n = llama_n_ctx(lctx);
-    }
-
-    if (params.sampling.dry_penalty_last_n == -1) {
-        LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
-        params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
-    }
-
     if (params.warmup) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
@@ -1241,12 +1303,11 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_set_warmup(lctx, false);
     }
 
-    iparams.model.reset(model);
-    iparams.context.reset(lctx);
-
-    return iparams;
+    return res;
 }
 
+common_init_result::~common_init_result() = default;
+
 std::string get_model_endpoint() {
     const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
     // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
@@ -1255,7 +1316,9 @@ std::string get_model_endpoint() {
     std::string model_endpoint = "https://huggingface.co/";
     if (endpoint_env) {
         model_endpoint = endpoint_env;
-        if (model_endpoint.back() != '/') model_endpoint += '/';
+        if (model_endpoint.back() != '/') {
+            model_endpoint += '/';
+        }
     }
     return model_endpoint;
 }
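For downstream callers, the practical effect of the common.cpp hunks above is the new return type of common_init_from_params: instead of a struct with public llama_model_ptr/llama_context_ptr members, it now returns a common_init_result_ptr whose resources are reached through accessor methods (declared in the common.h hunk below). A hedged migration sketch; the wrapper function and its error handling are illustrative only, not code from the package:

#include "common.h"
#include "llama.h"

// Illustrative caller showing the 1.4.8-style API shape.
static bool load_and_hold(common_params & params) {
    // 1.4.6: common_init_result init = common_init_from_params(params);
    //        llama_model * model = init.model.get();
    common_init_result_ptr init = common_init_from_params(params);

    llama_model *   model = init->model();
    llama_context * lctx  = init->context();
    if (model == nullptr || lctx == nullptr) {
        return false; // failures are already logged inside common_init_from_params
    }

    // ... decode / sample here; 'init' keeps the model, context, LoRA adapters
    //     and per-sequence samplers alive for as long as it stays in scope ...

    init->free_context(); // optionally drop the context early while keeping the model
    return true;
}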
package/src/llama.cpp/common/common.h

@@ -82,7 +82,8 @@ int32_t cpu_get_num_math();
 enum llama_example {
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
-    LLAMA_EXAMPLE_MAIN,
+    LLAMA_EXAMPLE_COMPLETION,
+    LLAMA_EXAMPLE_CLI,
     LLAMA_EXAMPLE_EMBEDDING,
     LLAMA_EXAMPLE_PERPLEXITY,
     LLAMA_EXAMPLE_RETRIEVAL,
@@ -98,6 +99,7 @@ enum llama_example {
     LLAMA_EXAMPLE_TTS,
     LLAMA_EXAMPLE_DIFFUSION,
     LLAMA_EXAMPLE_FINETUNE,
+    LLAMA_EXAMPLE_FIT_PARAMS,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -194,7 +196,6 @@ struct common_params_sampling {
 
     std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
 
-
     std::vector<enum common_sampler_type> samplers = {
         COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
@@ -215,6 +216,10 @@ struct common_params_sampling {
     std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
     std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
 
+    bool has_logit_bias() const {
+        return !logit_bias.empty();
+    }
+
     // print the parameters into a string
     std::string print() const;
 };
@@ -303,8 +308,8 @@ struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
 
 struct common_params {
     bool vocab_only = false;
-    int32_t n_predict = -1;  // new tokens to predict
-    int32_t n_ctx = 4096;    // context size
+    int32_t n_predict = -1;  // max. number of new tokens to predict, -1 == no limit
+    int32_t n_ctx = 0;       // context size, 0 == context the model was trained with
     int32_t n_batch = 2048;  // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512;  // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0;      // number of tokens to keep from initial prompt
@@ -325,9 +330,12 @@ struct common_params {
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
-    int32_t n_gpu_layers = -1;     // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu = 0;          // the GPU that is used for scratch and small tensors
-    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    int32_t n_gpu_layers = -1;                   // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu = 0;                        // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0};               // how split tensors should be distributed across GPUs
+    bool fit_params = true;                      // whether to fit unset model/context parameters to free device memory
+    size_t fit_params_target = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
+    int32_t fit_params_min_ctx = 4096;           // minimum context size to set when trying to reduce memory use
 
     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
@@ -407,6 +415,7 @@ struct common_params {
     bool simple_io     = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true;  // insert new sequences for decoding on-the-fly
     bool no_perf       = false; // disable performance metrics
+    bool show_timings  = true;  // show timing information on CLI
     bool ctx_shift     = false; // context shift on infinite text generation
     bool swa_full      = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified    = false; // enable unified KV cache
@@ -463,7 +472,7 @@ struct common_params {
     std::string public_path   = ""; // NOLINT
     std::string api_prefix    = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
-    bool use_jinja = false;         // NOLINT
+    bool use_jinja = true;          // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
@@ -483,9 +492,10 @@ struct common_params {
     bool endpoint_metrics = false;
 
     // router server configs
-    std::string models_dir = "";    // directory containing models for the router server
-    int models_max = 4;             // maximum number of models to load simultaneously
-    bool models_autoload = true;    // automatically load models when requested via the router server
+    std::string models_dir = "";    // directory containing models for the router server
+    std::string models_preset = ""; // directory containing model presets for the router server
+    int models_max = 4;             // maximum number of models to load simultaneously
+    bool models_autoload = true;    // automatically load models when requested via the router server
 
     bool log_json = false;
 
@@ -667,15 +677,29 @@ bool tty_can_use_colors();
 // Model utils
 //
 
-// note: defines object's lifetime
+struct common_sampler;
+
+// note: defines the model, context, samplers, ets. lifetimes
 struct common_init_result {
-    llama_model_ptr model;
-    llama_context_ptr context;
+    common_init_result(common_params & params);
+    ~common_init_result();
 
-    std::vector<llama_adapter_lora_ptr> lora;
+    llama_model *    model();
+    llama_context *  context();
+    common_sampler * sampler(llama_seq_id seq_id);
+
+    std::vector<llama_adapter_lora_ptr> & lora();
+
+    void free_context();
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
 };
 
-struct common_init_result common_init_from_params(common_params & params);
+using common_init_result_ptr = std::unique_ptr<common_init_result>;
+
+common_init_result_ptr common_init_from_params(common_params & params);
 
 struct llama_model_params   common_model_params_to_llama  (      common_params & params);
 struct llama_context_params common_context_params_to_llama(const common_params & params);
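Since common_init_result now also owns one sampler per sequence (the pimpl->samplers vector in the common.cpp hunks above), a caller can fetch a sampler by sequence id instead of constructing its own. A hedged sketch continuing the load_and_hold example above, assuming the usual common_sampler_sample / common_sampler_accept helpers from common/sampling.h:

// Sketch only: sample one token for sequence 0 with the init-owned sampler.
common_sampler * smpl = init->sampler(0);

const llama_token tok = common_sampler_sample(smpl, init->context(), /* idx */ -1);
common_sampler_accept(smpl, tok, /* accept_grammar */ true);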