@fugood/llama.node 1.2.4 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
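For reference, a diff like this can usually be reproduced locally with npm's built-in diff support (npm 7 or later), pointing both --diff specs at the published versions named above:

    npm diff --diff=@fugood/llama.node@1.2.4 --diff=@fugood/llama.node@1.2.6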
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.2.4",
+ "version": "1.2.6",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -72,19 +72,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.2.4",
- "@fugood/node-llama-linux-x64-vulkan": "1.2.4",
- "@fugood/node-llama-linux-x64-cuda": "1.2.4",
- "@fugood/node-llama-linux-arm64": "1.2.4",
- "@fugood/node-llama-linux-arm64-vulkan": "1.2.4",
- "@fugood/node-llama-linux-arm64-cuda": "1.2.4",
- "@fugood/node-llama-win32-x64": "1.2.4",
- "@fugood/node-llama-win32-x64-vulkan": "1.2.4",
- "@fugood/node-llama-win32-x64-cuda": "1.2.4",
- "@fugood/node-llama-win32-arm64": "1.2.4",
- "@fugood/node-llama-win32-arm64-vulkan": "1.2.4",
- "@fugood/node-llama-darwin-x64": "1.2.4",
- "@fugood/node-llama-darwin-arm64": "1.2.4"
+ "@fugood/node-llama-linux-x64": "1.2.6",
+ "@fugood/node-llama-linux-x64-vulkan": "1.2.6",
+ "@fugood/node-llama-linux-x64-cuda": "1.2.6",
+ "@fugood/node-llama-linux-arm64": "1.2.6",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.2.6",
+ "@fugood/node-llama-linux-arm64-cuda": "1.2.6",
+ "@fugood/node-llama-win32-x64": "1.2.6",
+ "@fugood/node-llama-win32-x64-vulkan": "1.2.6",
+ "@fugood/node-llama-win32-x64-cuda": "1.2.6",
+ "@fugood/node-llama-win32-arm64": "1.2.6",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.2.6",
+ "@fugood/node-llama-darwin-x64": "1.2.6",
+ "@fugood/node-llama-darwin-arm64": "1.2.6"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
@@ -3358,7 +3358,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  add_opt(common_arg(
  {"--chat-template-kwargs"}, "STRING",
  string_format("sets additional params for the json template parser"),
- [](common_params & params, const std::string & value) {
+ [](common_params & params, const std::string & value) {
  auto parsed = json::parse(value);
  for (const auto & item : parsed.items()) {
  params.default_template_kwargs[item.key()] = item.value().dump();
@@ -3570,21 +3570,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  common_log_set_file(common_log_main(), value.c_str());
  }
  ));
- add_opt(common_arg({ "--log-colors" }, "[on|off|auto]",
- "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
- "'auto' enables colors when output is to a terminal",
- [](common_params &, const std::string & value) {
- if (is_truthy(value)) {
- common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
- } else if (is_falsey(value)) {
- common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
- } else if (is_autoy(value)) {
- common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
- } else {
- throw std::invalid_argument(
- string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
- }
- }).set_env("LLAMA_LOG_COLORS"));
+ add_opt(common_arg(
+ {"--log-colors"}, "[on|off|auto]",
+ "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
+ "'auto' enables colors when output is to a terminal",
+ [](common_params &, const std::string & value) {
+ if (is_truthy(value)) {
+ common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
+ } else if (is_falsey(value)) {
+ common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
+ } else if (is_autoy(value)) {
+ common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
+ } else {
+ throw std::invalid_argument(
+ string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
+ }
+ }
+ ).set_env("LLAMA_LOG_COLORS"));
  add_opt(common_arg(
  {"-v", "--verbose", "--log-verbose"},
  "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
@@ -3850,7 +3852,87 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  ).set_examples({LLAMA_EXAMPLE_TTS}));

- // model-specific
+ add_opt(common_arg(
+ {"--diffusion-steps"}, "N",
+ string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
+ [](common_params & params, int value) { params.diffusion.steps = value; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-visual"},
+ string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.diffusion.visual_mode ? "true" : "false"),
+ [](common_params & params) { params.diffusion.visual_mode = true; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-eps"}, "F",
+ string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
+ [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-algorithm"}, "N",
+ string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm),
+ [](common_params & params, int value) { params.diffusion.algorithm = value; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-alg-temp"}, "F",
+ string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+ [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-block-length"}, "N",
+ string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
+ [](common_params & params, int value) { params.diffusion.block_length = value; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-cfg-scale"}, "F",
+ string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
+ [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-add-gumbel-noise"}, "F",
+ string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+ [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ { "-lr", "--learning-rate" }, "ALPHA",
+ string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0),
+ [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
+ string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
+ (double) params.lr.lr_min),
+ [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"-decay-epochs", "--learning-rate-decay-epochs"}, "ALPHA",
+ string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs),
+ [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"-wd", "--weight-decay"}, "WD",
+ string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).", (double) params.lr.wd),
+ [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"-val-split", "--val-split"}, "FRACTION",
+ string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split),
+ [](common_params & params, const std::string & value) { params.val_split = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"-epochs", "--epochs"}, "N",
+ string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
+ [](common_params & params, int epochs) { params.lr.epochs = epochs; }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"-opt", "--optimizer"}, "sgd|adamw", "adamw or sgd",
+ [](common_params & params, const std::string & name) {
+ params.optimizer = common_opt_get_optimizer(name.c_str());
+ if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
+ throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
+ }
+ }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+
+ // presets
  add_opt(common_arg(
  {"--tts-oute-default"},
  string_format("use default OuteTTS models (note: can download weights from the internet)"),
@@ -3863,39 +3945,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_TTS}));

  add_opt(common_arg(
- {"--embd-bge-small-en-default"},
- string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
- [](common_params & params) {
- params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
- params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
- params.embd_normalize = 2;
- params.n_ctx = 512;
- params.verbose_prompt = true;
- params.embedding = true;
- }
- ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
-
- add_opt(common_arg(
- {"--embd-e5-small-en-default"},
- string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
- [](common_params & params) {
- params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
- params.model.hf_file = "e5-small-v2-q8_0.gguf";
- params.embd_normalize = 2;
- params.n_ctx = 512;
- params.verbose_prompt = true;
- params.embedding = true;
- }
- ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
-
- add_opt(common_arg(
- {"--embd-gte-small-default"},
- string_format("use default gte-small model (note: can download weights from the internet)"),
+ {"--embd-gemma-default"},
+ string_format("use default EmbeddingGemma model (note: can download weights from the internet)"),
  [](common_params & params) {
- params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
- params.model.hf_file = "gte-small-q8_0.gguf";
- params.embd_normalize = 2;
- params.n_ctx = 512;
+ params.model.hf_repo = "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF";
+ params.model.hf_file = "embeddinggemma-300M-qat-Q4_0.gguf";
+ params.port = 8011;
+ params.n_ubatch = 2048;
+ params.n_batch = 2048;
+ params.n_parallel = 32;
+ params.n_ctx = 2048*params.n_parallel;
  params.verbose_prompt = true;
  params.embedding = true;
  }
@@ -3990,96 +4049,65 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_SERVER}));

  add_opt(common_arg(
- { "--diffusion-steps" }, "N",
- string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
- [](common_params & params, int value) { params.diffusion.steps = value; }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
- add_opt(common_arg(
- { "--diffusion-visual" },
- string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
- params.diffusion.visual_mode ? "true" : "false"),
- [](common_params & params) { params.diffusion.visual_mode = true; }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ {"--gpt-oss-20b-default"},
+ string_format("use gpt-oss-20b (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/gpt-oss-20b-GGUF";
+ params.model.hf_file = "gpt-oss-20b-mxfp4.gguf";
+ params.port = 8013;
+ params.n_ubatch = 2048;
+ params.n_batch = 32768;
+ params.n_parallel = 2;
+ params.n_ctx = 131072*params.n_parallel;
+ params.sampling.temp = 1.0f;
+ params.sampling.top_p = 1.0f;
+ params.sampling.top_k = 0;
+ params.sampling.min_p = 0.01f;
+ params.use_jinja = true;
+ //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));

  add_opt(common_arg(
- { "--diffusion-eps" }, "F",
- string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
- [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
- add_opt(common_arg(
- { "--diffusion-algorithm" }, "N",
- string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
- params.diffusion.algorithm),
- [](common_params & params, int value) { params.diffusion.algorithm = value; }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
- add_opt(common_arg(
- { "--diffusion-alg-temp" }, "F",
- string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
- [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ {"--gpt-oss-120b-default"},
+ string_format("use gpt-oss-120b (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/gpt-oss-120b-GGUF";
+ params.port = 8013;
+ params.n_ubatch = 2048;
+ params.n_batch = 32768;
+ params.n_parallel = 2;
+ params.n_ctx = 131072*params.n_parallel;
+ params.sampling.temp = 1.0f;
+ params.sampling.top_p = 1.0f;
+ params.sampling.top_k = 0;
+ params.sampling.min_p = 0.01f;
+ params.use_jinja = true;
+ //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));

  add_opt(common_arg(
- { "--diffusion-block-length" }, "N",
- string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
- [](common_params & params, int value) { params.diffusion.block_length = value; }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
- add_opt(common_arg(
- { "--diffusion-cfg-scale" }, "F",
- string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
- [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
- add_opt(common_arg(
- { "--diffusion-add-gumbel-noise" }, "F",
- string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
- [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
-
+ {"--vision-gemma-4b-default"},
+ string_format("use Gemma 3 4B QAT (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/gemma-3-4b-it-qat-GGUF";
+ params.port = 8014;
+ params.n_ctx = 0;
+ params.use_jinja = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));

- add_opt(
- common_arg({ "-lr", "--learning-rate" }, "ALPHA",
- string_format(
- "adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)",
- (double) params.lr.lr0),
- [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
- add_opt(
- common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
- string_format(
- "(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
- (double) params.lr.lr_min),
- [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
- add_opt(
- common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
- string_format(
- "(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
- (double) params.lr.decay_epochs),
- [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
- add_opt(common_arg(
- { "-wd", "--weight-decay" }, "WD",
- string_format(
- "adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).",
- (double) params.lr.wd),
- [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
- add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION",
- string_format("fraction of data to use as validation set for training (default: %.2g).",
- (double) params.val_split),
- [](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
- add_opt(common_arg({ "-epochs", "--epochs" }, "N",
- string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
- [](common_params & params, int epochs) { params.lr.epochs = epochs; })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
- add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
- [](common_params & params, const std::string & name) {
- params.optimizer = common_opt_get_optimizer(name.c_str());
- if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
- throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
- }
- })
- .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"--vision-gemma-12b-default"},
+ string_format("use Gemma 3 12B QAT (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/gemma-3-12b-it-qat-GGUF";
+ params.port = 8014;
+ params.n_ctx = 0;
+ params.use_jinja = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));

  return ctx_arg;
  }
@@ -432,7 +432,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
  if (is_arguments_path({})) {
  // Entire JSON is the arguments and was parsed fully.
  return consume_json_result {
- partial->json.dump(),
+ partial->json.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true),
  /* .is_partial = */ false,
  };
  }
@@ -444,7 +444,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
  std::vector<std::string> path;
  std::function<json(const json &)> remove_unsupported_healings_and_dump_args = [&](const json & j) -> json {
  if (is_arguments_path(path)) {
- auto arguments = j.dump();
+ auto arguments = j.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true);
  if (is_partial() && !partial->healing_marker.marker.empty()) {
  auto idx = arguments.find(partial->healing_marker.json_dump_marker);
  if (idx != std::string::npos) {
@@ -427,7 +427,7 @@ struct common_params {
  int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
  int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
  int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
- int32_t cache_ram_mib = 8192; // 0 = no limit, 1 = 1 MiB, etc.
+ int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.

  std::string hostname = "127.0.0.1";
  std::string public_path = ""; // NOLINT
@@ -5,6 +5,7 @@
  #include <nlohmann/json.hpp>

  #include <string>
+ #include <regex>

  using json = nlohmann::ordered_json;

@@ -168,6 +169,47 @@ bool common_json_parse(
  }
  }

+ // Matches a potentially partial unicode escape sequence, e.g. \u, \uX, \uXX, \uXXX, \uXXXX
+ static const std::regex partial_unicode_regex(R"(\\u(?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F])?)?)?)?$)");
+
+ auto is_high_surrogate = [&](const std::string & s) {
+ // Check if a partial of a high surrogate (U+D800-U+DBFF)
+ return s.length() >= 4 &&
+ s[0] == '\\' && s[1] == 'u' &&
+ std::tolower(s[2]) == 'd' &&
+ (s[3] == '8' || s[3] == '9' || std::tolower(s[3]) == 'a' || std::tolower(s[3]) == 'b');
+ };
+
+ // Initialize the unicode marker to a low surrogate to handle the edge case
+ // where a high surrogate (U+D800-U+DBFF) is immediately followed by a
+ // backslash (\)
+ std::string unicode_marker_padding = "udc00";
+ std::smatch last_unicode_seq;
+
+ if (std::regex_search(str, last_unicode_seq, partial_unicode_regex)) {
+ std::smatch second_last_seq;
+ std::string prelude = str.substr(0, last_unicode_seq.position());
+
+ // Pad the escape sequence with 0s until it forms a complete sequence of 6 characters
+ unicode_marker_padding = std::string(6 - last_unicode_seq.length(), '0');
+
+ if (is_high_surrogate(last_unicode_seq.str())) {
+ // If the sequence is a partial match for a high surrogate, add a low surrogate (U+DC00-U+UDFF)
+ unicode_marker_padding += "\\udc00";
+ } else if (std::regex_search(prelude, second_last_seq, partial_unicode_regex)) {
+ if (is_high_surrogate(second_last_seq.str())) {
+ // If this follows a high surrogate, pad it to be a low surrogate
+ if (last_unicode_seq.length() == 2) {
+ unicode_marker_padding = "dc00";
+ } else if (last_unicode_seq.length() == 3) {
+ unicode_marker_padding = "c00";
+ } else {
+ // The original unicode_marker_padding is already padded with 0s
+ }
+ }
+ }
+ }
+
  const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";

  if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
@@ -186,6 +228,9 @@ bool common_json_parse(
  } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
  // Was inside an object value string after an escape
  str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
+ } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
+ // Was inside an object value string after a partial unicode escape
+ str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
  } else {
  // find last :
  auto last_pos = str.find_last_of(':');
@@ -205,6 +250,9 @@ bool common_json_parse(
  } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
  // Was inside an array value string after an escape
  str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
+ } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
+ // Was inside an array value string after a partial unicode escape
+ str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
  } else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
  // Had just finished a value
  str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
@@ -230,6 +278,9 @@ bool common_json_parse(
  } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
  // Was inside an object key string after an escape
  str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
+ } else if (can_parse(str + unicode_marker_padding + "\": 1" + closing)) {
+ // Was inside an object key string after a partial unicode escape
+ str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\": 1" + closing;
  } else {
  auto last_pos = str.find_last_of(':');
  if (last_pos == std::string::npos) {
@@ -68,7 +68,7 @@ struct ggml_compute_params {
  #endif // __VXE2__
  #endif // __s390x__ && __VEC__

- #if defined(__ARM_FEATURE_SVE)
+ #if defined(__ARM_FEATURE_SVE) && defined(__linux__)
  #include <sys/prctl.h>
  #endif

@@ -689,8 +689,13 @@ bool ggml_is_numa(void) {
  #endif

  static void ggml_init_arm_arch_features(void) {
- #if defined(__linux__) && defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
+ #if defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
+ #if defined(__linux__)
  ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+ #else
+ // TODO: add support of SVE for non-linux systems
+ #error "TODO: SVE is not supported on this platform. To use SVE, sve_cnt needs to be initialized here."
+ #endif
  #endif
  }

@@ -463,9 +463,9 @@ ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const floa
  #endif
  for (; i < n; ++i) {
  float val = x[i] - mean;
+ y[i] = val;
  val *= val;
  sum += (ggml_float)val;
- y[i] = val;
  }
  return sum/n;
  }
@@ -144,14 +144,14 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
  for (int i = 0; i < np; i += ggml_f16_step) {
  ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements

- ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elemnst
+ ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements
  sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
  ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
  sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);

  ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements

- ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 ekements
+ ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements
  sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
  ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
  sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
@@ -160,7 +160,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG

  ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
  sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
- ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
+ ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
  sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);

  ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
@@ -820,7 +820,8 @@ inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_f
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
  inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
  for (int i = 0; i < n; ++i) {
- y[i] = GGML_CPU_FP32_TO_FP16(expm1f(GGML_CPU_FP16_TO_FP32(x[i])));
+ const float v = GGML_CPU_FP16_TO_FP32(x[i]);
+ y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : expm1f(v));
  }
  }
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
@@ -261,12 +261,17 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
  }
  }

- static void print_mask(float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
+ static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
  LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
- const char * swa_type_str = (swa_type == LLAMA_SWA_TYPE_NONE) ? "LLAMA_SWA_TYPE_NONE" :
- (swa_type == LLAMA_SWA_TYPE_STANDARD) ? "LLAMA_SWA_TYPE_STANDARD" :
- (swa_type == LLAMA_SWA_TYPE_CHUNKED) ? "LLAMA_SWA_TYPE_CHUNKED" :
- (swa_type == LLAMA_SWA_TYPE_SYMMETRIC) ? "LLAMA_SWA_TYPE_SYMMETRIC" : "unknown";
+ const char * swa_type_str = "unknown";
+
+ switch (swa_type) {
+ case LLAMA_SWA_TYPE_NONE: swa_type_str = "LLAMA_SWA_TYPE_NONE"; break;
+ case LLAMA_SWA_TYPE_STANDARD: swa_type_str = "LLAMA_SWA_TYPE_STANDARD"; break;
+ case LLAMA_SWA_TYPE_CHUNKED: swa_type_str = "LLAMA_SWA_TYPE_CHUNKED"; break;
+ case LLAMA_SWA_TYPE_SYMMETRIC: swa_type_str = "LLAMA_SWA_TYPE_SYMMETRIC"; break;
+ };
+
  LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str);
  LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__);
  LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
@@ -295,50 +300,67 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
  const int64_t n_kv = ubatch->n_tokens;
  const int64_t n_tokens = ubatch->n_tokens;

- GGML_ASSERT(kq_mask);
- GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask->buffer));
-
- float * data = (float *) kq_mask->data;
-
- // [TAG_NO_CACHE_ISWA]
- GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "TODO: implement");
+ const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
+ for (int h = 0; h < 1; ++h) {
+ for (int i1 = 0; i1 < n_tokens; ++i1) {
+ const llama_seq_id s1 = ubatch->seq_id[i1][0];
+ const llama_pos p1 = ubatch->pos[i1];

- for (int h = 0; h < 1; ++h) {
- for (int i1 = 0; i1 < n_tokens; ++i1) {
- const llama_seq_id s1 = ubatch->seq_id[i1][0];
+ const uint64_t idst = h*(n_kv*n_tokens) + i1*n_kv;

- for (int i0 = 0; i0 < n_tokens; ++i0) {
- float f = -INFINITY;
-
- for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) {
+ for (int i0 = 0; i0 < n_tokens; ++i0) {
  const llama_seq_id s0 = ubatch->seq_id[i0][0];
+ const llama_pos p0 = ubatch->pos[i0];

+ // mask different sequences
  if (s0 != s1) {
- continue; // skip different sequences
+ continue;
  }

- if (cparams.causal_attn && ubatch->pos[i0] > ubatch->pos[i1]) {
- continue; // skip future tokens for causal attention
+ // mask future tokens
+ if (cparams.causal_attn && p0 > p1) {
+ continue;
  }

- // TODO: this does not take into account that some layers are SWA and others are note (i.e. iSWA) [TAG_NO_CACHE_ISWA]
- //if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
- // continue; // skip masked tokens for SWA
- //}
-
- // TODO: reimplement this like in llama_kv_cache_unified
- if (hparams.use_alibi) {
- f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
- } else {
- f = 0.0f;
+ // apply SWA if any
+ if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
+ continue;
  }
+
+ data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
  }
- data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f;
  }
  }
+ };
+
+ {
+ GGML_ASSERT(self_kq_mask);
+ GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
+
+ float * data = (float *) self_kq_mask->data;
+
+ std::fill(data, data + ggml_nelements(self_kq_mask), -INFINITY);
+
+ fill_mask(data, 0, LLAMA_SWA_TYPE_NONE);
+
+ if (debug) {
+ print_mask(data, n_tokens, n_kv, 0, LLAMA_SWA_TYPE_NONE);
+ }
  }
- if (debug) {
- print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
+
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ GGML_ASSERT(self_kq_mask_swa);
+ GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
+
+ float * data = (float *) self_kq_mask_swa->data;
+
+ std::fill(data, data + ggml_nelements(self_kq_mask_swa), -INFINITY);
+
+ fill_mask(data, hparams.n_swa, hparams.swa_type);
+
+ if (debug) {
+ print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
+ }
  }
  }

@@ -1299,12 +1321,9 @@ ggml_tensor * llm_graph_context::build_attn_mha(
  k = ggml_permute(ctx0, k, 0, 2, 1, 3);
  v = ggml_permute(ctx0, v, 0, 2, 1, 3);

- const auto n_kv = k->ne[1];
-
  ggml_tensor * cur;

- // TODO: replace hardcoded padding with ggml-provided padding
- if (cparams.flash_attn && (n_kv % 256 == 0) && kq_b == nullptr) {
+ if (cparams.flash_attn && kq_b == nullptr) {
  GGML_ASSERT(kq_b == nullptr && "Flash attention does not support KQ bias yet");

  if (v_trans) {
@@ -1419,10 +1438,20 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
  auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);

  // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
- inp->kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
- ggml_set_input(inp->kq_mask);
+ inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+ ggml_set_input(inp->self_kq_mask);
+
+ inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;

- inp->kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask;
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+ ggml_set_input(inp->self_kq_mask_swa);
+
+ inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+ } else {
+ inp->self_kq_mask_swa = nullptr;
+ inp->self_kq_mask_swa_cnv = nullptr;
+ }

  return (llm_graph_input_attn_no_cache *) res->add_input(std::move(inp));
  }
@@ -1447,7 +1476,9 @@ ggml_tensor * llm_graph_context::build_attn(
  ggml_build_forward_expand(gf, k_cur);
  ggml_build_forward_expand(gf, v_cur);

- const auto & kq_mask = inp->get_kq_mask();
+ const bool is_swa = hparams.is_swa(il);
+
+ const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();

  // [TAG_NO_CACHE_PAD]
  // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
@@ -257,10 +257,14 @@ public:

  void set_input(const llama_ubatch * ubatch) override;

- ggml_tensor * get_kq_mask() const { return kq_mask_cnv; }
+ ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+ ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }

- ggml_tensor * kq_mask = nullptr; // F32 [n_tokens, n_batch, 1, 1]
- ggml_tensor * kq_mask_cnv = nullptr; // [n_tokens, n_batch, 1, 1]
+ // n_tokens == n_batch
+ ggml_tensor * self_kq_mask = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_cnv = nullptr; // [n_tokens, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_tokens, n_batch/n_stream, 1, n_stream]

  const llama_hparams hparams;
  const llama_cparams cparams;
@@ -140,7 +140,11 @@ uint32_t llama_hparams::n_embd_s() const {
  }

  bool llama_hparams::is_recurrent(uint32_t il) const {
- return recurrent_layer_arr[il];
+ if (il < n_layer) {
+ return recurrent_layer_arr[il];
+ }
+
+ GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer);
  }

  uint32_t llama_hparams::n_pos_per_embd() const {
@@ -11358,8 +11358,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
  }
  };

- struct llm_build_gemma_embedding_iswa : public llm_graph_context {
- llm_build_gemma_embedding_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ struct llm_build_gemma_embedding : public llm_graph_context {
+ llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_k;

  ggml_tensor * cur;
@@ -11376,8 +11376,7 @@ struct llm_build_gemma_embedding_iswa : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- // TODO: support cacheless iSWA embeddings [TAG_NO_CACHE_ISWA]
- auto * inp_attn = build_attn_inp_kv_iswa();
+ auto * inp_attn = build_attn_inp_no_cache();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -16313,10 +16312,10 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
  }

  ggml_tensor * build_layer_ffn(
- ggml_tensor * cur,
- ggml_tensor * inpSA,
- const llama_model & model,
- const int il) {
+ ggml_tensor * cur,
+ ggml_tensor * inpSA,
+ const llama_model & model,
+ const int il) {

  // For Granite architectures - scale residual
  if (hparams.f_residual_scale) {
@@ -19378,7 +19377,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  case LLM_ARCH_NOMIC_BERT_MOE:
  case LLM_ARCH_NEO_BERT:
  case LLM_ARCH_WAVTOKENIZER_DEC:
- //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
+ case LLM_ARCH_GEMMA_EMBEDDING:
  case LLM_ARCH_DREAM:
  case LLM_ARCH_LLADA:
  case LLM_ARCH_LLADA_MOE:
@@ -19671,7 +19670,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  } break;
  case LLM_ARCH_GEMMA_EMBEDDING:
  {
- llm = std::make_unique<llm_build_gemma_embedding_iswa>(*this, params);
+ llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
  } break;
  case LLM_ARCH_STARCODER2:
  {
@@ -312,6 +312,7 @@ struct llama_model * llama_model_load_from_splits(
  LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
  return nullptr;
  }
+ splits.reserve(n_paths);
  for (size_t i = 0; i < n_paths; ++i) {
  splits.push_back(paths[i]);
  }