@fugood/llama.node 1.2.4 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.2.4",
+ "version": "1.2.5",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -72,19 +72,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.2.4",
- "@fugood/node-llama-linux-x64-vulkan": "1.2.4",
- "@fugood/node-llama-linux-x64-cuda": "1.2.4",
- "@fugood/node-llama-linux-arm64": "1.2.4",
- "@fugood/node-llama-linux-arm64-vulkan": "1.2.4",
- "@fugood/node-llama-linux-arm64-cuda": "1.2.4",
- "@fugood/node-llama-win32-x64": "1.2.4",
- "@fugood/node-llama-win32-x64-vulkan": "1.2.4",
- "@fugood/node-llama-win32-x64-cuda": "1.2.4",
- "@fugood/node-llama-win32-arm64": "1.2.4",
- "@fugood/node-llama-win32-arm64-vulkan": "1.2.4",
- "@fugood/node-llama-darwin-x64": "1.2.4",
- "@fugood/node-llama-darwin-arm64": "1.2.4"
+ "@fugood/node-llama-linux-x64": "1.2.5",
+ "@fugood/node-llama-linux-x64-vulkan": "1.2.5",
+ "@fugood/node-llama-linux-x64-cuda": "1.2.5",
+ "@fugood/node-llama-linux-arm64": "1.2.5",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.2.5",
+ "@fugood/node-llama-linux-arm64-cuda": "1.2.5",
+ "@fugood/node-llama-win32-x64": "1.2.5",
+ "@fugood/node-llama-win32-x64-vulkan": "1.2.5",
+ "@fugood/node-llama-win32-x64-cuda": "1.2.5",
+ "@fugood/node-llama-win32-arm64": "1.2.5",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.2.5",
+ "@fugood/node-llama-darwin-x64": "1.2.5",
+ "@fugood/node-llama-darwin-arm64": "1.2.5"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
@@ -3358,7 +3358,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  add_opt(common_arg(
  {"--chat-template-kwargs"}, "STRING",
  string_format("sets additional params for the json template parser"),
- [](common_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  auto parsed = json::parse(value);
  for (const auto & item : parsed.items()) {
  params.default_template_kwargs[item.key()] = item.value().dump();
@@ -3570,21 +3570,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  common_log_set_file(common_log_main(), value.c_str());
  }
  ));
- add_opt(common_arg({ "--log-colors" }, "[on|off|auto]",
- "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
- "'auto' enables colors when output is to a terminal",
- [](common_params &, const std::string & value) {
- if (is_truthy(value)) {
- common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
- } else if (is_falsey(value)) {
- common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
- } else if (is_autoy(value)) {
- common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
- } else {
- throw std::invalid_argument(
- string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
- }
- }).set_env("LLAMA_LOG_COLORS"));
+ add_opt(common_arg(
+ {"--log-colors"}, "[on|off|auto]",
+ "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
+ "'auto' enables colors when output is to a terminal",
+ [](common_params &, const std::string & value) {
+ if (is_truthy(value)) {
+ common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
+ } else if (is_falsey(value)) {
+ common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
+ } else if (is_autoy(value)) {
+ common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
+ } else {
+ throw std::invalid_argument(
+ string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
+ }
+ }
+ ).set_env("LLAMA_LOG_COLORS"));
  add_opt(common_arg(
  {"-v", "--verbose", "--log-verbose"},
  "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
@@ -3850,7 +3852,87 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  ).set_examples({LLAMA_EXAMPLE_TTS}));

- // model-specific
+ add_opt(common_arg(
+ {"--diffusion-steps"}, "N",
+ string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
+ [](common_params & params, int value) { params.diffusion.steps = value; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-visual"},
+ string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.diffusion.visual_mode ? "true" : "false"),
+ [](common_params & params) { params.diffusion.visual_mode = true; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-eps"}, "F",
+ string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
+ [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-algorithm"}, "N",
+ string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm),
+ [](common_params & params, int value) { params.diffusion.algorithm = value; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-alg-temp"}, "F",
+ string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+ [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-block-length"}, "N",
+ string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
+ [](common_params & params, int value) { params.diffusion.block_length = value; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-cfg-scale"}, "F",
+ string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
+ [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-add-gumbel-noise"}, "F",
+ string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+ [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ { "-lr", "--learning-rate" }, "ALPHA",
+ string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0),
+ [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
+ string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
+ (double) params.lr.lr_min),
+ [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"-decay-epochs", "--learning-rate-decay-epochs"}, "ALPHA",
+ string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs),
+ [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"-wd", "--weight-decay"}, "WD",
+ string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).", (double) params.lr.wd),
+ [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"-val-split", "--val-split"}, "FRACTION",
+ string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split),
+ [](common_params & params, const std::string & value) { params.val_split = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"-epochs", "--epochs"}, "N",
+ string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
+ [](common_params & params, int epochs) { params.lr.epochs = epochs; }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"-opt", "--optimizer"}, "sgd|adamw", "adamw or sgd",
+ [](common_params & params, const std::string & name) {
+ params.optimizer = common_opt_get_optimizer(name.c_str());
+ if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
+ throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
+ }
+ }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+
+ // presets
  add_opt(common_arg(
  {"--tts-oute-default"},
  string_format("use default OuteTTS models (note: can download weights from the internet)"),
@@ -3863,39 +3945,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_TTS}));

  add_opt(common_arg(
- {"--embd-bge-small-en-default"},
- string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
- [](common_params & params) {
- params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
- params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
- params.embd_normalize = 2;
- params.n_ctx = 512;
- params.verbose_prompt = true;
- params.embedding = true;
- }
- ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
-
- add_opt(common_arg(
- {"--embd-e5-small-en-default"},
- string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
- [](common_params & params) {
- params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
- params.model.hf_file = "e5-small-v2-q8_0.gguf";
- params.embd_normalize = 2;
- params.n_ctx = 512;
- params.verbose_prompt = true;
- params.embedding = true;
- }
- ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
-
- add_opt(common_arg(
- {"--embd-gte-small-default"},
- string_format("use default gte-small model (note: can download weights from the internet)"),
+ {"--embd-gemma-default"},
+ string_format("use default EmbeddingGemma model (note: can download weights from the internet)"),
  [](common_params & params) {
- params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
- params.model.hf_file = "gte-small-q8_0.gguf";
- params.embd_normalize = 2;
- params.n_ctx = 512;
+ params.model.hf_repo = "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF";
+ params.model.hf_file = "embeddinggemma-300M-qat-Q4_0.gguf";
+ params.port = 8011;
+ params.n_ubatch = 2048;
+ params.n_batch = 2048;
+ params.n_parallel = 32;
+ params.n_ctx = 2048*params.n_parallel;
  params.verbose_prompt = true;
  params.embedding = true;
  }
@@ -3990,96 +4049,65 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_SERVER}));

  add_opt(common_arg(
- { "--diffusion-steps" }, "N",
- string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
- [](common_params & params, int value) { params.diffusion.steps = value; }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
- add_opt(common_arg(
- { "--diffusion-visual" },
- string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
- params.diffusion.visual_mode ? "true" : "false"),
- [](common_params & params) { params.diffusion.visual_mode = true; }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ {"--gpt-oss-20b-default"},
+ string_format("use gpt-oss-20b (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/gpt-oss-20b-GGUF";
+ params.model.hf_file = "gpt-oss-20b-mxfp4.gguf";
+ params.port = 8013;
+ params.n_ubatch = 2048;
+ params.n_batch = 32768;
+ params.n_parallel = 2;
+ params.n_ctx = 131072*params.n_parallel;
+ params.sampling.temp = 1.0f;
+ params.sampling.top_p = 1.0f;
+ params.sampling.top_k = 0;
+ params.sampling.min_p = 0.01f;
+ params.use_jinja = true;
+ //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));

  add_opt(common_arg(
- { "--diffusion-eps" }, "F",
- string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
- [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
- add_opt(common_arg(
- { "--diffusion-algorithm" }, "N",
- string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
- params.diffusion.algorithm),
- [](common_params & params, int value) { params.diffusion.algorithm = value; }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
- add_opt(common_arg(
- { "--diffusion-alg-temp" }, "F",
- string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
- [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ {"--gpt-oss-120b-default"},
+ string_format("use gpt-oss-120b (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/gpt-oss-120b-GGUF";
+ params.port = 8013;
+ params.n_ubatch = 2048;
+ params.n_batch = 32768;
+ params.n_parallel = 2;
+ params.n_ctx = 131072*params.n_parallel;
+ params.sampling.temp = 1.0f;
+ params.sampling.top_p = 1.0f;
+ params.sampling.top_k = 0;
+ params.sampling.min_p = 0.01f;
+ params.use_jinja = true;
+ //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));

  add_opt(common_arg(
- { "--diffusion-block-length" }, "N",
- string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
- [](common_params & params, int value) { params.diffusion.block_length = value; }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
- add_opt(common_arg(
- { "--diffusion-cfg-scale" }, "F",
- string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
- [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
- add_opt(common_arg(
- { "--diffusion-add-gumbel-noise" }, "F",
- string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
- [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
-
+ {"--vision-gemma-4b-default"},
+ string_format("use Gemma 3 4B QAT (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/gemma-3-4b-it-qat-GGUF";
+ params.port = 8014;
+ params.n_ctx = 0;
+ params.use_jinja = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));

- add_opt(
- common_arg({ "-lr", "--learning-rate" }, "ALPHA",
- string_format(
- "adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)",
- (double) params.lr.lr0),
- [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
- add_opt(
- common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
- string_format(
- "(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
- (double) params.lr.lr_min),
- [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
- add_opt(
- common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
- string_format(
- "(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
- (double) params.lr.decay_epochs),
- [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
- add_opt(common_arg(
- { "-wd", "--weight-decay" }, "WD",
- string_format(
- "adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).",
- (double) params.lr.wd),
- [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
- add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION",
- string_format("fraction of data to use as validation set for training (default: %.2g).",
- (double) params.val_split),
- [](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
- add_opt(common_arg({ "-epochs", "--epochs" }, "N",
- string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
- [](common_params & params, int epochs) { params.lr.epochs = epochs; })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
- add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
- [](common_params & params, const std::string & name) {
- params.optimizer = common_opt_get_optimizer(name.c_str());
- if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
- throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
- }
- })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"--vision-gemma-12b-default"},
+ string_format("use Gemma 3 12B QAT (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/gemma-3-12b-it-qat-GGUF";
+ params.port = 8014;
+ params.n_ctx = 0;
+ params.use_jinja = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));

  return ctx_arg;
  }
@@ -432,7 +432,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
  if (is_arguments_path({})) {
  // Entire JSON is the arguments and was parsed fully.
  return consume_json_result {
- partial->json.dump(),
+ partial->json.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true),
  /* .is_partial = */ false,
  };
  }
@@ -444,7 +444,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
  std::vector<std::string> path;
  std::function<json(const json &)> remove_unsupported_healings_and_dump_args = [&](const json & j) -> json {
  if (is_arguments_path(path)) {
- auto arguments = j.dump();
+ auto arguments = j.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true);
  if (is_partial() && !partial->healing_marker.marker.empty()) {
  auto idx = arguments.find(partial->healing_marker.json_dump_marker);
  if (idx != std::string::npos) {
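Both dump() call sites above now pass ensure_ascii = true, so any non-ASCII characters in tool-call arguments are emitted as \uXXXX escapes rather than raw UTF-8. A minimal standalone sketch of that nlohmann::json behavior (illustration only, not code from this package):

    #include <nlohmann/json.hpp>
    #include <iostream>

    int main() {
        nlohmann::ordered_json j;
        j["city"] = "Zürich";                       // non-ASCII payload
        std::cout << j.dump() << "\n";              // {"city":"Zürich"}   (ensure_ascii defaults to false)
        std::cout << j.dump(-1, ' ', true) << "\n"; // {"city":"Z\u00fcrich"}  (indent -1 = compact, ensure_ascii = true)
        return 0;
    }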
@@ -427,7 +427,7 @@ struct common_params {
  int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
  int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
  int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
- int32_t cache_ram_mib = 8192; // 0 = no limit, 1 = 1 MiB, etc.
+ int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.

  std::string hostname = "127.0.0.1";
  std::string public_path = ""; // NOLINT
@@ -5,6 +5,7 @@
  #include <nlohmann/json.hpp>

  #include <string>
+ #include <regex>

  using json = nlohmann::ordered_json;

@@ -168,6 +169,47 @@ bool common_json_parse(
  }
  }

+ // Matches a potentially partial unicode escape sequence, e.g. \u, \uX, \uXX, \uXXX, \uXXXX
+ static const std::regex partial_unicode_regex(R"(\\u(?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F])?)?)?)?$)");
+
+ auto is_high_surrogate = [&](const std::string & s) {
+ // Check if a partial of a high surrogate (U+D800-U+DBFF)
+ return s.length() >= 4 &&
+ s[0] == '\\' && s[1] == 'u' &&
+ std::tolower(s[2]) == 'd' &&
+ (s[3] == '8' || s[3] == '9' || std::tolower(s[3]) == 'a' || std::tolower(s[3]) == 'b');
+ };
+
+ // Initialize the unicode marker to a low surrogate to handle the edge case
+ // where a high surrogate (U+D800-U+DBFF) is immediately followed by a
+ // backslash (\)
+ std::string unicode_marker_padding = "udc00";
+ std::smatch last_unicode_seq;
+
+ if (std::regex_search(str, last_unicode_seq, partial_unicode_regex)) {
+ std::smatch second_last_seq;
+ std::string prelude = str.substr(0, last_unicode_seq.position());
+
+ // Pad the escape sequence with 0s until it forms a complete sequence of 6 characters
+ unicode_marker_padding = std::string(6 - last_unicode_seq.length(), '0');
+
+ if (is_high_surrogate(last_unicode_seq.str())) {
+ // If the sequence is a partial match for a high surrogate, add a low surrogate (U+DC00-U+UDFF)
+ unicode_marker_padding += "\udc00";
+ } else if (std::regex_search(prelude, second_last_seq, partial_unicode_regex)) {
+ if (is_high_surrogate(second_last_seq.str())) {
+ // If this follows a high surrogate, pad it to be a low surrogate
+ if (last_unicode_seq.length() == 2) {
+ unicode_marker_padding = "dc00";
+ } else if (last_unicode_seq.length() == 3) {
+ unicode_marker_padding = "c00";
+ } else {
+ // The original unicode_marker_padding is already padded with 0s
+ }
+ }
+ }
+ }
+

  const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";
  if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
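For context on the added block above: when a streamed JSON fragment is cut off in the middle of a \uXXXX escape, the healer pads the escape with zeros so it becomes a complete sequence, and if the dangling escape looks like a high surrogate (U+D800-U+DBFF) it appends a low surrogate so the pair stays valid before the healing marker and closing characters are appended. A rough standalone sketch of that padding rule (hypothetical helper name, not the actual llama.cpp code):

    #include <iostream>
    #include <regex>
    #include <string>

    // Compute the characters needed to complete a trailing partial \uXXXX escape.
    static std::string pad_partial_unicode(const std::string & s) {
        static const std::regex partial(R"(\\u[0-9a-fA-F]{0,4}$)");
        std::smatch m;
        if (!std::regex_search(s, m, partial)) {
            return "";                            // no partial escape at the end
        }
        std::string pad(6 - m.length(), '0');     // complete the escape to 6 chars: \uXXXX
        const std::string esc = m.str();
        const bool high_surrogate = esc.size() >= 4 &&
            (esc[2] == 'd' || esc[2] == 'D') &&
            std::string("89abAB").find(esc[3]) != std::string::npos;
        if (high_surrogate) {
            pad += "\\udc00";                     // pair the high surrogate with a low surrogate
        }
        return pad;
    }

    int main() {
        const std::string fragment = "\"abc\\uD8";                        // string truncated mid-escape
        std::cout << fragment << pad_partial_unicode(fragment) << "\"\n"; // "abc\uD800\udc00"
        return 0;
    }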
@@ -186,6 +228,9 @@ bool common_json_parse(
  } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
  // Was inside an object value string after an escape
  str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
+ } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
+ // Was inside an object value string after a partial unicode escape
+ str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
  } else {
  // find last :
  auto last_pos = str.find_last_of(':');
@@ -205,6 +250,9 @@ bool common_json_parse(
  } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
  // Was inside an array value string after an escape
  str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
+ } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
+ // Was inside an array value string after a partial unicode escape
+ str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
  } else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
  // Had just finished a value
  str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
@@ -230,6 +278,9 @@ bool common_json_parse(
  } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
  // Was inside an object key string after an escape
  str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
+ } else if (can_parse(str + unicode_marker_padding + "\": 1" + closing)) {
+ // Was inside an object key string after a partial unicode escape
+ str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\": 1" + closing;
  } else {
  auto last_pos = str.find_last_of(':');
  if (last_pos == std::string::npos) {
@@ -144,14 +144,14 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
  for (int i = 0; i < np; i += ggml_f16_step) {
  ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements

- ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elemnst
+ ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements
  sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
  ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
  sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);

  ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements

- ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 ekements
+ ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements
  sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
  ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
  sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
@@ -160,7 +160,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG

  ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
  sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
- ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
+ ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
  sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);

  ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
@@ -820,7 +820,8 @@ inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_f
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
  inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
  for (int i = 0; i < n; ++i) {
- y[i] = GGML_CPU_FP32_TO_FP16(expm1f(GGML_CPU_FP16_TO_FP32(x[i])));
+ const float v = GGML_CPU_FP16_TO_FP32(x[i]);
+ y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : expm1f(v));
  }
  }
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
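The hunk above fixes the FP16 ELU path: the previous code applied expm1f() to every element, while ELU is the identity for positive inputs and exp(x) - 1 only for x <= 0, matching the FP32 variant on the line above it. A tiny plain-float illustration of the intended function (standalone, assumed alpha = 1):

    #include <cmath>
    #include <cstdio>

    // ELU with alpha = 1: f(x) = x for x > 0, expm1(x) = exp(x) - 1 otherwise.
    static float elu(float x) {
        return (x > 0.0f) ? x : expm1f(x);
    }

    int main(void) {
        printf("elu( 2.0) = %f\n", elu( 2.0f));  // 2.000000 (the old FP16 path returned expm1f(2) ~= 6.389056)
        printf("elu(-1.0) = %f\n", elu(-1.0f));  // -0.632121
        return 0;
    }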
@@ -140,7 +140,11 @@ uint32_t llama_hparams::n_embd_s() const {
  }

  bool llama_hparams::is_recurrent(uint32_t il) const {
- return recurrent_layer_arr[il];
+ if (il < n_layer) {
+ return recurrent_layer_arr[il];
+ }
+
+ GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer);
  }

  uint32_t llama_hparams::n_pos_per_embd() const {
@@ -16313,10 +16313,10 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
  }

  ggml_tensor * build_layer_ffn(
- ggml_tensor * cur,
- ggml_tensor * inpSA,
- const llama_model & model,
- const int il) {
+ ggml_tensor * cur,
+ ggml_tensor * inpSA,
+ const llama_model & model,
+ const int il) {

  // For Granite architectures - scale residual
  if (hparams.f_residual_scale) {