@fugood/llama.node 1.4.8 → 1.4.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.4.8",
+ "version": "1.4.9",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -72,20 +72,20 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-darwin-arm64": "1.4.8",
- "@fugood/node-llama-darwin-x64": "1.4.8",
- "@fugood/node-llama-linux-arm64": "1.4.8",
- "@fugood/node-llama-linux-arm64-cuda": "1.4.8",
- "@fugood/node-llama-linux-arm64-snapdragon": "1.4.8",
- "@fugood/node-llama-linux-arm64-vulkan": "1.4.8",
- "@fugood/node-llama-linux-x64": "1.4.8",
- "@fugood/node-llama-linux-x64-cuda": "1.4.8",
- "@fugood/node-llama-linux-x64-vulkan": "1.4.8",
- "@fugood/node-llama-win32-arm64": "1.4.8",
- "@fugood/node-llama-win32-arm64-vulkan": "1.4.8",
- "@fugood/node-llama-win32-x64": "1.4.8",
- "@fugood/node-llama-win32-x64-cuda": "1.4.8",
- "@fugood/node-llama-win32-x64-vulkan": "1.4.8"
+ "@fugood/node-llama-darwin-arm64": "1.4.9",
+ "@fugood/node-llama-darwin-x64": "1.4.9",
+ "@fugood/node-llama-linux-arm64": "1.4.9",
+ "@fugood/node-llama-linux-arm64-cuda": "1.4.9",
+ "@fugood/node-llama-linux-arm64-snapdragon": "1.4.9",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.4.9",
+ "@fugood/node-llama-linux-x64": "1.4.9",
+ "@fugood/node-llama-linux-x64-cuda": "1.4.9",
+ "@fugood/node-llama-linux-x64-vulkan": "1.4.9",
+ "@fugood/node-llama-win32-arm64": "1.4.9",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.4.9",
+ "@fugood/node-llama-win32-x64": "1.4.9",
+ "@fugood/node-llama-win32-x64-cuda": "1.4.9",
+ "@fugood/node-llama-win32-x64-vulkan": "1.4.9"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
@@ -98,7 +98,7 @@ index 6085510a4..263076ce2 100644
  struct common_chat_tool_call {
  std::string name;
  diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
- index 5a8cf5248..8010a990e 100644
+ index d4e8c7405..af3dec813 100644
  --- a/src/llama.cpp/common/common.cpp
  +++ b/src/llama.cpp/common/common.cpp
  @@ -1343,6 +1343,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
@@ -110,7 +110,7 @@ index 5a8cf5248..8010a990e 100644
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
- index d70744840..dea8c4546 100644
+ index 3e314f4c8..5750a4057 100644
  --- a/src/llama.cpp/common/common.h
  +++ b/src/llama.cpp/common/common.h
  @@ -307,6 +307,7 @@ struct lr_opt {
@@ -122,7 +122,7 @@ index d70744840..dea8c4546 100644
  int32_t n_ctx = 0; // context size, 0 == context the model was trained with
  int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
  diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
- index fc31089f3..aa9befe4c 100644
+ index 28fb7612e..63f7e1ca1 100644
  --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
  +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
  @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
@@ -135,10 +135,10 @@ index fc31089f3..aa9befe4c 100644
  check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
  if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
  diff --git a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
- index 514f086f6..792abaa58 100644
+ index 6a00abacc..9e12459b6 100644
  --- a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
  +++ b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
- @@ -3213,11 +3213,26 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
+ @@ -3226,11 +3226,26 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
  GGML_UNUSED(dev);
  }

@@ -168,7 +168,7 @@ index 514f086f6..792abaa58 100644
  GGML_UNUSED(dev);
  }

- @@ -3398,10 +3413,17 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
+ @@ -3413,10 +3428,17 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
  }
  }

@@ -187,7 +187,7 @@ index 514f086f6..792abaa58 100644

  GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);

- @@ -3414,6 +3436,8 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
+ @@ -3429,6 +3451,8 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
  } catch (std::exception const &exc) {
  GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
  devices[i].context = nullptr;
@@ -250,6 +250,8 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  }

  common_params params;
+ params.fit_params = false;
+
  params.model.path = get_option<std::string>(options, "model", "");
  if (params.model.path.empty()) {
  Napi::TypeError::New(env, "Model is required").ThrowAsJavaScriptException();
@@ -420,6 +420,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
  }
  };

+ std::set<std::string> seen_args;
+
  for (int i = 1; i < argc; i++) {
  const std::string arg_prefix = "--";

@@ -430,6 +432,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
  if (arg_to_options.find(arg) == arg_to_options.end()) {
  throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
  }
+ if (!seen_args.insert(arg).second) {
+ LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+ }
  auto & tmp = arg_to_options[arg];
  auto opt = *tmp.first;
  bool is_positive = tmp.second;
@@ -750,6 +755,8 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
  }
  };

+ std::set<std::string> seen_args;
+
  for (int i = 1; i < argc; i++) {
  const std::string arg_prefix = "--";

@@ -760,6 +767,9 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
  if (arg_to_options.find(arg) == arg_to_options.end()) {
  throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
  }
+ if (!seen_args.insert(arg).second) {
+ LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+ }
  auto opt = *arg_to_options[arg];
  std::string val;
  if (opt.value_hint != nullptr) {
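For context, the duplicate-argument warning added to both parsers above hinges on std::set::insert returning a pair whose .second member is false when the key was already present. A self-contained sketch of just that idiom (the sample argument list is made up):

    #include <cstdio>
    #include <set>
    #include <string>

    int main() {
        // duplicate-flag detection as in the hunks above: insert().second is false
        // for a repeated flag, which is what triggers the deprecation warning
        std::set<std::string> seen_args;
        const char * args[] = { "--lora", "a.gguf", "--lora", "b.gguf" };
        for (const std::string arg : args) {
            if (arg.rfind("--", 0) == 0 && !seen_args.insert(arg).second) {
                std::printf("DEPRECATED: argument '%s' specified multiple times\n", arg.c_str());
            }
        }
    }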
@@ -863,7 +873,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  sampler_type_chars += common_sampler_type_to_chr(sampler);
  sampler_type_names += common_sampler_type_to_str(sampler) + ";";
  }
- sampler_type_names.pop_back();
+ if (!sampler_type_names.empty()) {
+ sampler_type_names.pop_back(); // remove last semicolon
+ }


  /**
@@ -1184,7 +1196,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params, const std::string & value) {
  params.system_prompt = value;
  }
- ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
  add_opt(common_arg(
  {"--perf"},
  {"--no-perf"},
@@ -1226,13 +1238,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
  add_opt(common_arg(
  {"--in-file"}, "FNAME",
- "an input file (repeat to specify multiple files)",
+ "an input file (use comma-separated values to specify multiple files)",
  [](common_params & params, const std::string & value) {
- std::ifstream file(value);
- if (!file) {
- throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+ for (const auto & item : string_split<std::string>(value, ',')) {
+ std::ifstream file(item);
+ if (!file) {
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
+ }
+ params.in_files.push_back(item);
  }
- params.in_files.push_back(value);
  }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
  add_opt(common_arg(
@@ -1969,9 +1983,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
  add_opt(common_arg(
  {"--image", "--audio"}, "FILE",
- "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
+ "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
  [](common_params & params, const std::string & value) {
- params.image.emplace_back(value);
+ for (const auto & item : string_split<std::string>(value, ',')) {
+ params.image.emplace_back(item);
+ }
  }
  ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
  add_opt(common_arg(
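The same comma-splitting convention is applied to --in-file and --image/--audio above. A minimal sketch of what a value like photo.png,clip.wav expands to, using a local splitter as a stand-in for llama.cpp's string_split helper (filenames invented):

    #include <cstdio>
    #include <sstream>
    #include <string>
    #include <vector>

    // stand-in for the string_split<std::string>(value, ',') calls used by the new
    // option handlers; llama.cpp's own template helper is not reproduced here
    static std::vector<std::string> split_csv(const std::string & value) {
        std::vector<std::string> out;
        std::string item;
        std::istringstream ss(value);
        while (std::getline(ss, item, ',')) {
            out.push_back(item);
        }
        return out;
    }

    int main() {
        // e.g. "--image photo.png,clip.wav" now queues both files in one pass
        for (const auto & item : split_csv("photo.png,clip.wav")) {
            std::printf("%s\n", item.c_str());
        }
    }

Per the new parser warning above, repeating such a flag now keeps only the last value, so the comma-separated form replaces repetition.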
@@ -2218,12 +2234,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  ));
  add_opt(common_arg(
- {"--override-kv"}, "KEY=TYPE:VALUE",
- "advanced option to override model metadata by key. may be specified multiple times.\n"
- "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
+ {"--override-kv"}, "KEY=TYPE:VALUE,...",
+ "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.\n"
+ "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
  [](common_params & params, const std::string & value) {
- if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
- throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str()));
+ std::vector<std::string> kv_overrides;
+
+ std::string current;
+ bool escaping = false;
+
+ for (const char c : value) {
+ if (escaping) {
+ current.push_back(c);
+ escaping = false;
+ } else if (c == '\\') {
+ escaping = true;
+ } else if (c == ',') {
+ kv_overrides.push_back(current);
+ current.clear();
+ } else {
+ current.push_back(c);
+ }
+ }
+
+ if (escaping) {
+ current.push_back('\\');
+ }
+
+ kv_overrides.push_back(current);
+
+ for (const auto & kv_override : kv_overrides) {
+ if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) {
+ throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str()));
+ }
  }
  }
  ));
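The new --override-kv handler above splits on commas but lets a backslash escape a literal comma inside a value. A standalone sketch of that splitter with a worked input (the first key comes from the help text above; the second override is invented for illustration):

    #include <cstdio>
    #include <string>
    #include <vector>

    // same escape-aware splitting as the --override-kv handler: '\' escapes the next
    // character, so "a\,b" stays one item while a bare ',' separates overrides
    static std::vector<std::string> split_overrides(const std::string & value) {
        std::vector<std::string> out;
        std::string current;
        bool escaping = false;
        for (const char c : value) {
            if (escaping) {
                current.push_back(c);
                escaping = false;
            } else if (c == '\\') {
                escaping = true;
            } else if (c == ',') {
                out.push_back(current);
                current.clear();
            } else {
                current.push_back(c);
            }
        }
        if (escaping) {
            current.push_back('\\');
        }
        out.push_back(current);
        return out;
    }

    int main() {
        // prints two overrides; the escaped comma stays inside the second one
        for (const auto & s : split_overrides("tokenizer.ggml.add_bos_token=bool:false,note=str:a\\,b")) {
            std::printf("%s\n", s.c_str());
        }
    }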
@@ -2237,33 +2280,50 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ));
  add_opt(common_arg(
  {"--lora"}, "FNAME",
- "path to LoRA adapter (can be repeated to use multiple adapters)",
+ "path to LoRA adapter (use comma-separated values to load multiple adapters)",
  [](common_params & params, const std::string & value) {
- params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr });
+ for (const auto & item : string_split<std::string>(value, ',')) {
+ params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
+ }
  }
  // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
  ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
  add_opt(common_arg(
- {"--lora-scaled"}, "FNAME", "SCALE",
- "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
- [](common_params & params, const std::string & fname, const std::string & scale) {
- params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
+ {"--lora-scaled"}, "FNAME:SCALE,...",
+ "path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
+ "note: use comma-separated values",
+ [](common_params & params, const std::string & value) {
+ for (const auto & item : string_split<std::string>(value, ',')) {
+ auto parts = string_split<std::string>(item, ':');
+ if (parts.size() != 2) {
+ throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
+ }
+ params.lora_adapters.push_back({ parts[0], std::stof(parts[1]), "", "", nullptr });
+ }
  }
  // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
  ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
  add_opt(common_arg(
  {"--control-vector"}, "FNAME",
- "add a control vector\nnote: this argument can be repeated to add multiple control vectors",
+ "add a control vector\nnote: use comma-separated values to add multiple control vectors",
  [](common_params & params, const std::string & value) {
- params.control_vectors.push_back({ 1.0f, value, });
+ for (const auto & item : string_split<std::string>(value, ',')) {
+ params.control_vectors.push_back({ 1.0f, item, });
+ }
  }
  ));
  add_opt(common_arg(
- {"--control-vector-scaled"}, "FNAME", "SCALE",
+ {"--control-vector-scaled"}, "FNAME:SCALE,...",
  "add a control vector with user defined scaling SCALE\n"
- "note: this argument can be repeated to add multiple scaled control vectors",
- [](common_params & params, const std::string & fname, const std::string & scale) {
- params.control_vectors.push_back({ std::stof(scale), fname });
+ "note: use comma-separated values (format: FNAME:SCALE,...)",
+ [](common_params & params, const std::string & value) {
+ for (const auto & item : string_split<std::string>(value, ',')) {
+ auto parts = string_split<std::string>(item, ':');
+ if (parts.size() != 2) {
+ throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
+ }
+ params.control_vectors.push_back({ std::stof(parts[1]), parts[0] });
+ }
  }
  ));
  add_opt(common_arg(
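--lora-scaled and --control-vector-scaled now take comma-separated FNAME:SCALE pairs instead of two positional values. A sketch of how one such pair parses (local splitter instead of string_split; the adapter names are invented), so for example --lora-scaled adapter-a.gguf:0.5,adapter-b.gguf:1.0 yields two adapters with those scales:

    #include <sstream>
    #include <stdexcept>
    #include <string>
    #include <utility>
    #include <vector>

    // parse one "FNAME:SCALE" item, e.g. "adapter-a.gguf:0.5"; mirrors the two-part
    // check in the new handlers, but is not the library code itself
    static std::pair<std::string, float> parse_scaled(const std::string & item) {
        std::vector<std::string> parts;
        std::string part;
        std::istringstream ss(item);
        while (std::getline(ss, part, ':')) {
            parts.push_back(part);
        }
        if (parts.size() != 2) {
            throw std::invalid_argument("expected FNAME:SCALE");
        }
        return { parts[0], std::stof(parts[1]) };
    }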
@@ -2353,13 +2413,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_env("HF_TOKEN"));
  add_opt(common_arg(
  {"--context-file"}, "FNAME",
- "file to load context from (repeat to specify multiple files)",
+ "file to load context from (use comma-separated values to specify multiple files)",
  [](common_params & params, const std::string & value) {
- std::ifstream file(value, std::ios::binary);
- if (!file) {
- throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+ for (const auto & item : string_split<std::string>(value, ',')) {
+ std::ifstream file(item, std::ios::binary);
+ if (!file) {
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
+ }
+ params.context_files.push_back(item);
  }
- params.context_files.push_back(value);
  }
  ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
  add_opt(common_arg(
@@ -2550,6 +2612,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.api_prefix = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
+ add_opt(common_arg(
+ {"--webui-config"}, "JSON",
+ "JSON that provides default WebUI settings (overrides WebUI defaults)",
+ [](common_params & params, const std::string & value) {
+ params.webui_config_json = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
+ add_opt(common_arg(
+ {"--webui-config-file"}, "PATH",
+ "JSON file that provides default WebUI settings (overrides WebUI defaults)",
+ [](common_params & params, const std::string & value) {
+ params.webui_config_json = read_file(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
  add_opt(common_arg(
  {"--webui"},
  {"--no-webui"},
@@ -1092,7 +1092,7 @@ common_init_result::common_init_result(common_params & params) :
  auto cparams = common_context_params_to_llama(params);

  if (params.fit_params) {
- LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
+ LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
  llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
  params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
  params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
@@ -485,8 +485,11 @@ struct common_params {

  std::map<std::string, std::string> default_template_kwargs;

+ // webui configs
+ bool webui = true;
+ std::string webui_config_json;
+
  // "advanced" endpoints are disabled by default for better security
- bool webui = true;
  bool endpoint_slots = true;
  bool endpoint_props = false; // only control POST requests, not GET
  bool endpoint_metrics = false;
@@ -104,10 +104,9 @@ struct ring_buffer {
  struct common_sampler {
  common_params_sampling params;

+ struct llama_sampler * grmr;
  struct llama_sampler * chain;

- bool grammar;
-
  ring_buffer<llama_token> prev;

  std::vector<llama_token_data> cur;
@@ -167,15 +166,14 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

  lparams.no_perf = params.no_perf;

+ llama_sampler * grmr = nullptr;
  llama_sampler * chain = llama_sampler_chain_init(lparams);

- bool grammar = false;
  std::vector<llama_sampler *> samplers;

  if (params.grammar.compare(0, 11, "%llguidance") == 0) {
  #ifdef LLAMA_USE_LLGUIDANCE
- samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
- grammar = true;
+ grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
  #else
  GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
  #endif // LLAMA_USE_LLGUIDANCE
@@ -224,15 +222,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

  if (!params.grammar.empty()) {
  if (params.grammar_lazy) {
- samplers.push_back(
- llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
- trigger_patterns_c.data(), trigger_patterns_c.size(),
- trigger_tokens.data(), trigger_tokens.size()));
+ grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+ trigger_patterns_c.data(), trigger_patterns_c.size(),
+ trigger_tokens.data(), trigger_tokens.size());
  } else {
- samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
+ grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
  }
-
- grammar = true;
  }
  }

@@ -303,8 +298,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

  auto * result = new common_sampler {
  /* .params = */ params,
+ /* .grmr = */ grmr,
  /* .chain = */ chain,
- /* .grammar = */ grammar,
  /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
  /* .cur = */ {},
  /* .cur_p = */ {},
@@ -315,6 +310,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

  void common_sampler_free(struct common_sampler * gsmpl) {
  if (gsmpl) {
+ llama_sampler_free(gsmpl->grmr);
  llama_sampler_free(gsmpl->chain);

  delete gsmpl;
@@ -324,25 +320,12 @@ void common_sampler_free(struct common_sampler * gsmpl) {
  void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
  const auto tm = gsmpl->tm();

- if (gsmpl->grammar) {
- const int n_smpl = llama_sampler_chain_n(gsmpl->chain);
-
- for (int i = 0; i < n_smpl; i++) {
- auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
-
- // the grammar sampler is always the first one
- if (i == 0) {
- if (accept_grammar) {
- llama_sampler_accept(smpl, token);
- }
- } else {
- llama_sampler_accept(smpl, token);
- }
- }
- } else {
- llama_sampler_accept(gsmpl->chain, token);
+ if (gsmpl->grmr && accept_grammar) {
+ llama_sampler_accept(gsmpl->grmr, token);
  }

+ llama_sampler_accept(gsmpl->chain, token);
+
  gsmpl->prev.push_back(token);
  }

@@ -353,8 +336,8 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
  struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
  return new common_sampler {
  /* .params = */ gsmpl->params,
+ /* .grmr = */ llama_sampler_clone(gsmpl->grmr),
  /* .chain = */ llama_sampler_clone(gsmpl->chain),
- /* .grammar = */ gsmpl->grammar,
  /* .prev = */ gsmpl->prev,
  /* .cur = */ gsmpl->cur,
  /* .cur_p = */ gsmpl->cur_p,
@@ -410,7 +393,7 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
  return gsmpl->chain;
  }

- llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
+ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
  llama_synchronize(ctx);

  // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
@@ -418,11 +401,42 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co

  llama_token id = LLAMA_TOKEN_NULL;

+ auto & grmr = gsmpl->grmr;
  auto & chain = gsmpl->chain;
  auto & cur_p = gsmpl->cur_p; // initialized by set_logits

  gsmpl->set_logits(ctx, idx);

+ if (grammar_first) {
+ llama_sampler_apply(grmr, &cur_p);
+ }
+
+ llama_sampler_apply(chain, &cur_p);
+
+ id = cur_p.data[cur_p.selected].id;
+
+ if (grammar_first) {
+ return id;
+ }
+
+ // check if it the sampled token fits the grammar (grammar-based rejection sampling)
+ {
+ llama_token_data single_token_data = { id, 1.0f, 0.0f };
+ llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
+
+ llama_sampler_apply(grmr, &single_token_data_array);
+
+ const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
+ if (is_valid) {
+ return id;
+ }
+ }
+
+ // resampling:
+ // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
+ gsmpl->set_logits(ctx, idx);
+
+ llama_sampler_apply(grmr, &cur_p);
  llama_sampler_apply(chain, &cur_p);

  GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
@@ -432,7 +446,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
  return id;
  }

- std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
+ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
  GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");

  std::vector<llama_token> result;
@@ -440,7 +454,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample

  size_t i = 0;
  for (; i < draft.size(); i++) {
- const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
+ const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);

  common_sampler_accept(gsmpl, id, true);

@@ -452,7 +466,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
  }

  if (i == draft.size()) {
- const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
+ const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);

  common_sampler_accept(gsmpl, id, true);

@@ -462,13 +476,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
  return result;
  }

- std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
+ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
  std::vector<int> idxs(draft.size() + 1);
  for (size_t i = 0; i < idxs.size(); ++i) {
  idxs[i] = i;
  }

- return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
+ return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
  }

  uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
@@ -57,7 +57,10 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
  // - check if the token fits the grammar (if any)
  // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
  //
- llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);
+ // if grammar_first is true, the grammar is applied before the samplers (slower)
+ // useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
+ //
+ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);

  // generalized version of common_sampler_sample
  //
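The header comments above describe the restored behaviour: sample once without the grammar, keep the token if the grammar accepts it, otherwise re-apply the grammar and sample again; with grammar_first the constraint is applied to every candidate up front. The toy program below only illustrates that rejection/resample pattern, not the library code (the probabilities and validity predicate are invented, and it assumes at least one valid candidate exists):

    #include <cstdio>
    #include <random>
    #include <vector>

    // fast path: sample without the constraint; fall back to constrained sampling
    // only when the unconstrained pick turns out to be invalid
    static int sample_with_constraint(const std::vector<float> & probs,
                                      bool (*is_valid)(int),
                                      std::mt19937 & rng) {
        std::discrete_distribution<int> dist(probs.begin(), probs.end());
        const int id = dist(rng);          // unconstrained sample
        if (is_valid(id)) {
            return id;
        }
        std::vector<float> masked = probs; // resample with invalid candidates masked out
        for (int i = 0; i < (int) masked.size(); ++i) {
            if (!is_valid(i)) {
                masked[i] = 0.0f;
            }
        }
        std::discrete_distribution<int> redist(masked.begin(), masked.end());
        return redist(rng);
    }

    int main() {
        std::mt19937 rng(42);
        const std::vector<float> probs = { 0.5f, 0.3f, 0.2f };
        const int id = sample_with_constraint(probs, [](int i) { return i != 0; }, rng);
        std::printf("sampled token id: %d\n", id);
    }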
@@ -75,10 +78,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co

  // returns at least 1 token, up to idxs.size()
  //
- std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);
+ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);

  // assume idxs == [ 0, 1, 2, ..., draft.size() ]
- std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);
+ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);

  uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);

@@ -315,7 +315,7 @@ llama_tokens common_speculative_gen_draft(
  for (int i = 0; i < params.n_draft; ++i) {
  common_batch_clear(batch);

- common_sampler_sample(smpl, ctx_dft, 0);
+ common_sampler_sample(smpl, ctx_dft, 0, true);

  const auto * cur_p = common_sampler_get_candidates(smpl, true);

@@ -458,6 +458,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  if (GGML_RV_ZFH)
  string(APPEND MARCH_STR "_zfh")
  endif()
+
  if (GGML_XTHEADVECTOR)
  string(APPEND MARCH_STR "_xtheadvector")
  elseif (GGML_RVV)
@@ -465,6 +466,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  if (GGML_RV_ZVFH)
  string(APPEND MARCH_STR "_zvfh")
  endif()
+ if (GGML_RV_ZVFBFWMA)
+ string(APPEND MARCH_STR "_zvfbfwma")
+ endif()
  endif()
  if (GGML_RV_ZICBOP)
  string(APPEND MARCH_STR "_zicbop")