@novastera-oss/llamarn 0.2.4 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. package/RNLlamaCpp.podspec +3 -2
  2. package/android/CMakeLists.txt +6 -3
  3. package/android/src/main/cpp/include/llama.h +12 -8
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  12. package/cpp/LlamaCppModel.cpp +46 -65
  13. package/cpp/LlamaCppModel.h +5 -0
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/README.md +1 -0
  16. package/cpp/llama.cpp/common/CMakeLists.txt +5 -8
  17. package/cpp/llama.cpp/common/arg.cpp +8 -6
  18. package/cpp/llama.cpp/common/chat-parser.cpp +4 -3
  19. package/cpp/llama.cpp/common/chat-parser.h +2 -1
  20. package/cpp/llama.cpp/common/chat.cpp +4 -4
  21. package/cpp/llama.cpp/common/common.cpp +2 -0
  22. package/cpp/llama.cpp/common/json-partial.cpp +5 -4
  23. package/cpp/llama.cpp/common/json-partial.h +2 -1
  24. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
  25. package/cpp/llama.cpp/common/json-schema-to-grammar.h +4 -4
  26. package/cpp/llama.cpp/convert_hf_to_gguf.py +31 -28
  27. package/cpp/llama.cpp/ggml/include/ggml.h +1 -3
  28. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +2 -0
  29. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +10 -5
  30. package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  31. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +23 -0
  32. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -0
  33. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1 -1
  34. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +19 -8
  35. package/cpp/llama.cpp/ggml/src/ggml-impl.h +2 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -2
  37. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +0 -8
  38. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +118 -11
  39. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1 -1
  40. package/cpp/llama.cpp/ggml/src/ggml.c +9 -2
  41. package/cpp/llama.cpp/ggml/src/ggml.cpp +26 -0
  42. package/cpp/llama.cpp/ggml/src/gguf.cpp +19 -2
  43. package/cpp/llama.cpp/include/llama.h +12 -8
  44. package/cpp/llama.cpp/src/CMakeLists.txt +3 -0
  45. package/cpp/llama.cpp/src/llama-batch.cpp +19 -12
  46. package/cpp/llama.cpp/src/llama-batch.h +15 -10
  47. package/cpp/llama.cpp/src/llama-context.cpp +226 -151
  48. package/cpp/llama.cpp/src/llama-context.h +25 -8
  49. package/cpp/llama.cpp/src/llama-graph.cpp +50 -47
  50. package/cpp/llama.cpp/src/llama-graph.h +25 -24
  51. package/cpp/llama.cpp/src/llama-kv-cache-recurrent.cpp +1132 -0
  52. package/cpp/llama.cpp/src/llama-kv-cache-recurrent.h +191 -0
  53. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +249 -0
  54. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +136 -0
  55. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +1717 -0
  56. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +278 -0
  57. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -2746
  58. package/cpp/llama.cpp/src/llama-kv-cache.h +14 -472
  59. package/cpp/llama.cpp/src/llama-kv-cells.h +37 -6
  60. package/cpp/llama.cpp/src/llama-memory.h +44 -0
  61. package/cpp/llama.cpp/src/llama-model.cpp +23 -16
  62. package/cpp/llama.cpp/src/llama-vocab.cpp +7 -2
  63. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +10518 -0
  64. package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +93468 -0
  65. package/cpp/llama.cpp/{common → vendor}/minja/chat-template.hpp +1 -1
  66. package/cpp/llama.cpp/{common → vendor}/minja/minja.hpp +1 -1
  67. package/cpp/llama.cpp/{common → vendor/nlohmann}/json.hpp +3027 -2267
  68. package/cpp/llama.cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  69. package/cpp/llama.cpp/vendor/stb/stb_image.h +7988 -0
  70. package/cpp/rn-completion.cpp +101 -52
  71. package/cpp/rn-utils.hpp +8 -1
  72. package/ios/include/common/minja/chat-template.hpp +1 -1
  73. package/ios/include/common/minja/minja.hpp +1 -1
  74. package/ios/include/json-schema-to-grammar.h +4 -4
  75. package/ios/include/llama.h +12 -8
  76. package/ios/include/{common → nlohmann}/json.hpp +3027 -2267
  77. package/ios/libs/llama.xcframework/Info.plist +22 -22
  78. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  79. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4689 -4617
  80. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +1 -3
  81. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +12 -8
  82. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  83. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  84. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4638
  85. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3622 -3557
  86. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  87. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
  88. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  89. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  90. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4638
  91. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3624 -3559
  92. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +1 -3
  93. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +12 -8
  94. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +1 -3
  95. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +12 -8
  96. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  97. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +1 -3
  98. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +12 -8
  99. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  100. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  101. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  102. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4689 -4616
  103. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +1 -3
  104. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +12 -8
  105. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  106. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  107. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4637
  108. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3622 -3556
  109. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  110. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
  111. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  112. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  113. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4725 -4653
  114. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +1 -3
  115. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +12 -8
  116. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  117. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  118. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4746 -4674
  119. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3652 -3587
  120. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  121. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
  122. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  123. package/package.json +1 -1
@@ -147,30 +147,23 @@ CompletionResult run_completion(
147
147
  json data = options.to_json();
148
148
  // Prepare the sampling parameters
149
149
  const auto& params = rn_ctx->params;
150
-
151
- // Set the prompt
152
- if (data.contains("prompt")) {
153
- // Tokenize the prompt
154
- const auto& tokenized_prompts = tokenize_input_prompts(rn_ctx->vocab, data["prompt"], true, true);
155
- if (tokenized_prompts.empty() || tokenized_prompts[0].empty()) {
156
- result.success = false;
157
- result.error_msg = "Empty prompt";
158
- result.error_type = RN_ERROR_INVALID_PARAM;
159
- return result;
150
+
151
+ // Create a copy of sampling parameters and apply grammar if provided
152
+ common_params_sampling sampling_params = params.sampling;
153
+ if (!options.grammar.empty()) {
154
+ sampling_params.grammar = options.grammar;
155
+ // Force grammar_lazy to false whenever tools are present to ensure strict JSON format enforcement
156
+ if (!options.tools.empty()) {
157
+ sampling_params.grammar_lazy = false;
158
+ } else {
159
+ sampling_params.grammar_lazy = options.grammar_lazy;
160
+ }
161
+ // Pass grammar_triggers if any were provided by chat_params and passed via options
162
+ if (!options.grammar_triggers.empty()) {
163
+ sampling_params.grammar_triggers = options.grammar_triggers;
160
164
  }
161
- state.prompt_tokens = std::move(tokenized_prompts[0]);
162
- } else {
163
- result.success = false;
164
- result.error_msg = "No prompt provided";
165
- result.error_type = RN_ERROR_INVALID_PARAM;
166
- return result;
167
165
  }
168
166
 
169
- // Configure state
170
- state.n_ctx = llama_n_ctx(rn_ctx->ctx);
171
- state.n_predict = options.n_predict > 0 ? options.n_predict : params.n_predict;
172
- state.n_remaining = state.n_predict;
173
-
174
167
  // Parse tool_choice
175
168
  if (options.tool_choice == "auto") {
176
169
  state.tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
@@ -179,8 +172,8 @@ CompletionResult run_completion(
179
172
  } else if (options.tool_choice == "required") {
180
173
  state.tool_choice = COMMON_CHAT_TOOL_CHOICE_REQUIRED;
181
174
  }
182
- // Initialize the sampler
183
- state.sampler = common_sampler_init(rn_ctx->model, params.sampling);
175
+ // Initialize the sampler with the updated sampling parameters
176
+ state.sampler = common_sampler_init(rn_ctx->model, sampling_params);
184
177
  if (!state.sampler) {
185
178
  result.success = false;
186
179
  result.error_msg = "Failed to initialize sampler";
@@ -201,6 +194,29 @@ CompletionResult run_completion(
201
194
  }
202
195
  }
203
196
 
197
+ // Set the prompt
198
+ if (data.contains("prompt")) {
199
+ // Tokenize the prompt
200
+ const auto& tokenized_prompts = tokenize_input_prompts(rn_ctx->vocab, data["prompt"], true, true);
201
+ if (tokenized_prompts.empty() || tokenized_prompts[0].empty()) {
202
+ result.success = false;
203
+ result.error_msg = "Empty prompt";
204
+ result.error_type = RN_ERROR_INVALID_PARAM;
205
+ return result;
206
+ }
207
+ state.prompt_tokens = std::move(tokenized_prompts[0]);
208
+ } else {
209
+ result.success = false;
210
+ result.error_msg = "No prompt provided";
211
+ result.error_type = RN_ERROR_INVALID_PARAM;
212
+ return result;
213
+ }
214
+
215
+ // Configure state
216
+ state.n_ctx = llama_n_ctx(rn_ctx->ctx);
217
+ state.n_predict = options.n_predict > 0 ? options.n_predict : params.n_predict;
218
+ state.n_remaining = state.n_predict;
219
+
204
220
  // Process the prompt
205
221
  for (int i = 0; i < (int)state.prompt_tokens.size(); ++i) {
206
222
  llama_token token = state.prompt_tokens[i];
@@ -222,12 +238,22 @@ CompletionResult run_completion(
222
238
  return result;
223
239
  }
224
240
 
225
- common_sampler_accept(state.sampler, token, true);
241
+ // For lazy grammars, we need to accept prompt tokens to properly set up the grammar state
242
+ // For non-lazy grammars, we only accept if no grammar is present (grammar needs clean state)
243
+ if (sampling_params.grammar.empty() || sampling_params.grammar_lazy) {
244
+ common_sampler_accept(state.sampler, token, true);
245
+ }
226
246
  state.n_past++;
227
247
  }
228
248
 
229
249
  result.n_prompt_tokens = state.prompt_tokens.size();
230
250
 
251
+ // If using a non-lazy grammar, ensure the sampler is in a clean state for the grammar
252
+ if (!sampling_params.grammar.empty() && !sampling_params.grammar_lazy) {
253
+ common_sampler_free(state.sampler);
254
+ state.sampler = common_sampler_init(rn_ctx->model, sampling_params);
255
+ }
256
+
231
257
  // Start generating tokens
232
258
  const int64_t t_start_generation = ggml_time_us();
233
259
 
@@ -328,6 +354,15 @@ CompletionResult run_chat_completion(
328
354
  std::function<bool(const std::string&, bool)> callback) {
329
355
 
330
356
  CompletionResult result;
357
+ // Log incoming tools via callback
358
+ /*
359
+ if (callback) {
360
+ std::string tools_json_str = options.tools.dump(2);
361
+ std::string debug_msg = "[DEBUG RN_COMPLETION_OPTIONS_TOOLS] options.tools JSON: " + tools_json_str;
362
+ callback(debug_msg, false); // false for is_done
363
+ }
364
+ */
365
+ completion_state state;
331
366
 
332
367
  if (!rn_ctx || !rn_ctx->model || !rn_ctx->ctx) {
333
368
  result.success = false;
@@ -362,9 +397,9 @@ CompletionResult run_chat_completion(
362
397
  // Parse tools if present
363
398
  if (data.contains("tools") && !data["tools"].empty()) {
364
399
  template_inputs.tools = common_chat_tools_parse_oaicompat(data["tools"]);
365
- // Check if parallel tool calls are allowed (advanced feature)
366
- template_inputs.parallel_tool_calls = data.contains("parallel_tool_calls") ?
367
- json_value(data, "parallel_tool_calls", false) : false;
400
+ // Force parallel_tool_calls to true if tools are present, as this generally
401
+ // aligns with grammars expecting a list of tool calls.
402
+ template_inputs.parallel_tool_calls = true;
368
403
  }
369
404
 
370
405
  // Parse tool_choice if present
@@ -378,13 +413,42 @@ CompletionResult run_chat_completion(
378
413
  // Apply template
379
414
  const auto& chat_params = common_chat_templates_apply(rn_ctx->chat_templates.get(), template_inputs);
380
415
 
381
- // Set up completion options
382
416
  CompletionOptions cmpl_options = options;
383
417
  cmpl_options.prompt = chat_params.prompt;
384
418
 
385
- // Apply grammar if needed
386
419
  if (!chat_params.grammar.empty()) {
387
420
  cmpl_options.grammar = chat_params.grammar;
421
+ // Always force grammar_lazy to false when tools are present
422
+ if (!template_inputs.tools.empty()) {
423
+ cmpl_options.grammar_lazy = false;
424
+ } else {
425
+ // Only use chat_params.grammar_lazy if no tools are present
426
+ cmpl_options.grammar_lazy = chat_params.grammar_lazy;
427
+ }
428
+ // Default to grammar_triggers provided by chat_params
429
+ cmpl_options.grammar_triggers = chat_params.grammar_triggers;
430
+
431
+ bool original_grammar_lazy = chat_params.grammar_lazy; // Store original for logging
432
+
433
+ // Add a debug log to observe final grammar_lazy and grammar_triggers
434
+ /*
435
+ if (callback) {
436
+ std::string tool_choice_str;
437
+ switch (template_inputs.tool_choice) {
438
+ case COMMON_CHAT_TOOL_CHOICE_AUTO: tool_choice_str = "auto"; break;
439
+ case COMMON_CHAT_TOOL_CHOICE_NONE: tool_choice_str = "none"; break;
440
+ case COMMON_CHAT_TOOL_CHOICE_REQUIRED: tool_choice_str = "required"; break;
441
+ default: tool_choice_str = "unknown"; break;
442
+ }
443
+ std::string debug_msg = "[DEBUG CHAT_PARAMS] grammar_lazy: " +
444
+ std::string(cmpl_options.grammar_lazy ? "true" : "false") +
445
+ " | grammar_triggers_count: " + std::to_string(cmpl_options.grammar_triggers.size()) + // Log triggers from cmpl_options
446
+ " | For Tool Choice: " + tool_choice_str +
447
+ " | Parallel Tool Calls: " + std::string(template_inputs.parallel_tool_calls ? "true" : "false") +
448
+ " | Original chat_params.grammar_lazy: " + std::string(original_grammar_lazy ? "true" : "false"); // Log original lazy
449
+ callback(debug_msg, false);
450
+ }
451
+ */
388
452
  }
389
453
 
390
454
  // Run standard completion with the processed prompt
@@ -435,31 +499,15 @@ CompletionResult run_chat_completion(
435
499
 
436
500
  // Add parsed content and tool calls if available
437
501
  if (has_parsed_content && !parsed_msg.tool_calls.empty()) {
438
- // Set content to the parsed content (may be null for tool-only responses)
439
- if (!parsed_msg.content.empty()) {
440
- choice["message"]["content"] = parsed_msg.content;
441
- } else {
442
- choice["message"]["content"] = nullptr;
443
- }
444
-
445
- // Add tool calls to the message
446
- json tool_calls = json::array();
447
- for (const auto& tool_call : parsed_msg.tool_calls) {
448
- json tc = {
449
- {"id", tool_call.id.empty() ? ("call_" + std::to_string(std::rand())) : tool_call.id},
450
- {"type", "function"},
451
- {"function", {
452
- {"name", tool_call.name},
453
- {"arguments", tool_call.arguments}
454
- }}
455
- };
456
- tool_calls.push_back(tc);
457
- }
458
- choice["message"]["tool_calls"] = tool_calls;
502
+ // Use the server.cpp approach: let the common_chat_msg handle the JSON conversion
503
+ choice["message"] = parsed_msg.to_json_oaicompat<json>();
459
504
  choice["finish_reason"] = "tool_calls";
505
+ } else if (has_parsed_content && !parsed_msg.content.empty()) {
506
+ // Regular text response with parsed content
507
+ choice["message"]["content"] = parsed_msg.content;
460
508
  } else {
461
- // Regular text response
462
- choice["message"]["content"] = has_parsed_content ? parsed_msg.content : result.content;
509
+ // Fallback to raw content if parsing failed or no tools
510
+ choice["message"]["content"] = result.content;
463
511
  }
464
512
 
465
513
  choices.push_back(choice);
@@ -487,3 +535,4 @@ CompletionResult run_chat_completion(
487
535
 
488
536
  } // namespace facebook::react
489
537
 
538
+
package/cpp/rn-utils.hpp CHANGED
@@ -10,7 +10,7 @@
10
10
 
11
11
  // Change JSON_ASSERT from assert() to GGML_ASSERT:
12
12
  #define JSON_ASSERT GGML_ASSERT
13
- #include "json.hpp"
13
+ #include "nlohmann/json.hpp"
14
14
  #include "base64.hpp"
15
15
  #include "chat.h"
16
16
 
@@ -66,6 +66,7 @@ struct CompletionOptions {
66
66
  int seed = -1;
67
67
  json tools; // tools for function calling
68
68
  std::string tool_choice = "auto"; // tool choice mode: "auto", "none", or "required"
69
+ std::vector<common_grammar_trigger> grammar_triggers; // For lazy grammar
69
70
 
70
71
  // Convert to JSON for the completion API
71
72
  json to_json() const {
@@ -98,6 +99,12 @@ struct CompletionOptions {
98
99
  j["tools"] = tools;
99
100
  j["tool_choice"] = tool_choice;
100
101
  }
102
+ // Add grammar_triggers if available (mainly for internal use, not direct API option)
103
+ if (!grammar_triggers.empty()) {
104
+ // This part is tricky as json can't directly hold common_grammar_trigger easily.
105
+ // For now, we'll skip adding it to the generic to_json() as it's passed internally.
106
+ // If it were needed for an API, we'd need a proper serialization for grammar_triggers.
107
+ }
101
108
  return j;
102
109
  }
103
110
 
@@ -22,7 +22,7 @@
22
22
  #include <string>
23
23
  #include <vector>
24
24
 
25
- #include <json.hpp>
25
+ #include <nlohmann/json.hpp>
26
26
 
27
27
  using json = nlohmann::ordered_json;
28
28
 
@@ -29,7 +29,7 @@
29
29
  #include <utility>
30
30
  #include <vector>
31
31
 
32
- #include <json.hpp>
32
+ #include <nlohmann/json.hpp>
33
33
 
34
34
  using json = nlohmann::ordered_json;
35
35
 
@@ -1,9 +1,9 @@
1
1
  #pragma once
2
2
 
3
- #include "ggml.h"
4
- // Change JSON_ASSERT from assert() to GGML_ASSERT:
5
- #define JSON_ASSERT GGML_ASSERT
6
- #include "json.hpp"
3
+ #include <nlohmann/json_fwd.hpp>
4
+
5
+ #include <functional>
6
+ #include <string>
7
7
 
8
8
  std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
9
9
  bool force_gbnf = false);
@@ -259,9 +259,9 @@ extern "C" {
259
259
  llama_token * token;
260
260
  float * embd;
261
261
  llama_pos * pos;
262
- int32_t * n_seq_id;
263
- llama_seq_id ** seq_id;
264
- int8_t * logits; // TODO: rename this to "output"
262
+ int32_t * n_seq_id; // TODO: remove, should belong to only 1 sequence
263
+ llama_seq_id ** seq_id; // TODO: become llama_seq_id * seq_id;
264
+ int8_t * logits; // TODO: rename this to "output"
265
265
  } llama_batch;
266
266
 
267
267
  enum llama_model_kv_override_type {
@@ -366,6 +366,8 @@ extern "C" {
366
366
  bool no_perf; // measure performance timings
367
367
  bool op_offload; // offload host tensor operations to device
368
368
  bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
369
+ // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
370
+ // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
369
371
  };
370
372
 
371
373
  // model quantization parameters
@@ -502,6 +504,7 @@ extern "C" {
502
504
  LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
503
505
  LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
504
506
  LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
507
+ LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model);
505
508
 
506
509
  // Get the model's RoPE frequency scaling factor
507
510
  LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
@@ -652,7 +655,6 @@ extern "C" {
652
655
  // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
653
656
  // If the KV cache is RoPEd, the KV data is updated accordingly:
654
657
  // - lazily on next llama_decode()
655
- // - explicitly with llama_kv_self_update()
656
658
  // p0 < 0 : [0, p1]
657
659
  // p1 < 0 : [p0, inf)
658
660
  LLAMA_API void llama_kv_self_seq_add(
@@ -665,7 +667,6 @@ extern "C" {
665
667
  // Integer division of the positions by factor of `d > 1`
666
668
  // If the KV cache is RoPEd, the KV data is updated accordingly:
667
669
  // - lazily on next llama_decode()
668
- // - explicitly with llama_kv_self_update()
669
670
  // p0 < 0 : [0, p1]
670
671
  // p1 < 0 : [p0, inf)
671
672
  LLAMA_API void llama_kv_self_seq_div(
@@ -677,12 +678,14 @@ extern "C" {
677
678
 
678
679
  // Returns the smallest position present in the KV cache for the specified sequence
679
680
  // This is typically non-zero only for SWA caches
681
+ // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
680
682
  // Return -1 if the sequence is empty
681
683
  LLAMA_API llama_pos llama_kv_self_seq_pos_min(
682
684
  struct llama_context * ctx,
683
685
  llama_seq_id seq_id);
684
686
 
685
687
  // Returns the largest position present in the KV cache for the specified sequence
688
+ // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
686
689
  // Return -1 if the sequence is empty
687
690
  LLAMA_API llama_pos llama_kv_self_seq_pos_max(
688
691
  struct llama_context * ctx,
@@ -691,14 +694,15 @@ extern "C" {
691
694
  // Defragment the KV cache
692
695
  // This will be applied:
693
696
  // - lazily on next llama_decode()
694
- // - explicitly with llama_kv_self_update()
695
- LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
697
+ LLAMA_API DEPRECATED(void llama_kv_self_defrag(struct llama_context * ctx),
698
+ "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
696
699
 
697
700
  // Check if the context supports KV cache shifting
698
701
  LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
699
702
 
700
703
  // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
701
- LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
704
+ LLAMA_API DEPRECATED(void llama_kv_self_update(struct llama_context * ctx),
705
+ "simply remove this call, updates are applied lazily on the next llama_decode()");
702
706
 
703
707
  //
704
708
  // State / sessions