@novastera-oss/llamarn 0.2.4 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/RNLlamaCpp.podspec +3 -2
- package/android/CMakeLists.txt +6 -3
- package/android/src/main/cpp/include/llama.h +12 -8
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +46 -65
- package/cpp/LlamaCppModel.h +5 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/README.md +1 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +5 -8
- package/cpp/llama.cpp/common/arg.cpp +8 -6
- package/cpp/llama.cpp/common/chat-parser.cpp +4 -3
- package/cpp/llama.cpp/common/chat-parser.h +2 -1
- package/cpp/llama.cpp/common/chat.cpp +4 -4
- package/cpp/llama.cpp/common/common.cpp +2 -0
- package/cpp/llama.cpp/common/json-partial.cpp +5 -4
- package/cpp/llama.cpp/common/json-partial.h +2 -1
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
- package/cpp/llama.cpp/common/json-schema-to-grammar.h +4 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +31 -28
- package/cpp/llama.cpp/ggml/include/ggml.h +1 -3
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +10 -5
- package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +23 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +19 -8
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +118 -11
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml.c +9 -2
- package/cpp/llama.cpp/ggml/src/ggml.cpp +26 -0
- package/cpp/llama.cpp/ggml/src/gguf.cpp +19 -2
- package/cpp/llama.cpp/include/llama.h +12 -8
- package/cpp/llama.cpp/src/CMakeLists.txt +3 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +19 -12
- package/cpp/llama.cpp/src/llama-batch.h +15 -10
- package/cpp/llama.cpp/src/llama-context.cpp +226 -151
- package/cpp/llama.cpp/src/llama-context.h +25 -8
- package/cpp/llama.cpp/src/llama-graph.cpp +50 -47
- package/cpp/llama.cpp/src/llama-graph.h +25 -24
- package/cpp/llama.cpp/src/llama-kv-cache-recurrent.cpp +1132 -0
- package/cpp/llama.cpp/src/llama-kv-cache-recurrent.h +191 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +249 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +136 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +1717 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +278 -0
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -2746
- package/cpp/llama.cpp/src/llama-kv-cache.h +14 -472
- package/cpp/llama.cpp/src/llama-kv-cells.h +37 -6
- package/cpp/llama.cpp/src/llama-memory.h +44 -0
- package/cpp/llama.cpp/src/llama-model.cpp +23 -16
- package/cpp/llama.cpp/src/llama-vocab.cpp +7 -2
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +10518 -0
- package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +93468 -0
- package/cpp/llama.cpp/{common → vendor}/minja/chat-template.hpp +1 -1
- package/cpp/llama.cpp/{common → vendor}/minja/minja.hpp +1 -1
- package/cpp/llama.cpp/{common → vendor/nlohmann}/json.hpp +3027 -2267
- package/cpp/llama.cpp/vendor/nlohmann/json_fwd.hpp +187 -0
- package/cpp/llama.cpp/vendor/stb/stb_image.h +7988 -0
- package/cpp/rn-completion.cpp +101 -52
- package/cpp/rn-utils.hpp +8 -1
- package/ios/include/common/minja/chat-template.hpp +1 -1
- package/ios/include/common/minja/minja.hpp +1 -1
- package/ios/include/json-schema-to-grammar.h +4 -4
- package/ios/include/llama.h +12 -8
- package/ios/include/{common → nlohmann}/json.hpp +3027 -2267
- package/ios/libs/llama.xcframework/Info.plist +22 -22
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4689 -4617
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4638
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3622 -3557
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4638
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3624 -3559
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4689 -4616
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4637
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3622 -3556
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4725 -4653
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4746 -4674
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3652 -3587
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
package/cpp/rn-completion.cpp
CHANGED
|
@@ -147,30 +147,23 @@ CompletionResult run_completion(
|
|
|
147
147
|
json data = options.to_json();
|
|
148
148
|
// Prepare the sampling parameters
|
|
149
149
|
const auto& params = rn_ctx->params;
|
|
150
|
-
|
|
151
|
-
//
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
150
|
+
|
|
151
|
+
// Create a copy of sampling parameters and apply grammar if provided
|
|
152
|
+
common_params_sampling sampling_params = params.sampling;
|
|
153
|
+
if (!options.grammar.empty()) {
|
|
154
|
+
sampling_params.grammar = options.grammar;
|
|
155
|
+
// Force grammar_lazy to false whenever tools are present to ensure strict JSON format enforcement
|
|
156
|
+
if (!options.tools.empty()) {
|
|
157
|
+
sampling_params.grammar_lazy = false;
|
|
158
|
+
} else {
|
|
159
|
+
sampling_params.grammar_lazy = options.grammar_lazy;
|
|
160
|
+
}
|
|
161
|
+
// Pass grammar_triggers if any were provided by chat_params and passed via options
|
|
162
|
+
if (!options.grammar_triggers.empty()) {
|
|
163
|
+
sampling_params.grammar_triggers = options.grammar_triggers;
|
|
160
164
|
}
|
|
161
|
-
state.prompt_tokens = std::move(tokenized_prompts[0]);
|
|
162
|
-
} else {
|
|
163
|
-
result.success = false;
|
|
164
|
-
result.error_msg = "No prompt provided";
|
|
165
|
-
result.error_type = RN_ERROR_INVALID_PARAM;
|
|
166
|
-
return result;
|
|
167
165
|
}
|
|
168
166
|
|
|
169
|
-
// Configure state
|
|
170
|
-
state.n_ctx = llama_n_ctx(rn_ctx->ctx);
|
|
171
|
-
state.n_predict = options.n_predict > 0 ? options.n_predict : params.n_predict;
|
|
172
|
-
state.n_remaining = state.n_predict;
|
|
173
|
-
|
|
174
167
|
// Parse tool_choice
|
|
175
168
|
if (options.tool_choice == "auto") {
|
|
176
169
|
state.tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
|
|
@@ -179,8 +172,8 @@ CompletionResult run_completion(
|
|
|
179
172
|
} else if (options.tool_choice == "required") {
|
|
180
173
|
state.tool_choice = COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
|
181
174
|
}
|
|
182
|
-
// Initialize the sampler
|
|
183
|
-
state.sampler = common_sampler_init(rn_ctx->model,
|
|
175
|
+
// Initialize the sampler with the updated sampling parameters
|
|
176
|
+
state.sampler = common_sampler_init(rn_ctx->model, sampling_params);
|
|
184
177
|
if (!state.sampler) {
|
|
185
178
|
result.success = false;
|
|
186
179
|
result.error_msg = "Failed to initialize sampler";
|
|
@@ -201,6 +194,29 @@ CompletionResult run_completion(
|
|
|
201
194
|
}
|
|
202
195
|
}
|
|
203
196
|
|
|
197
|
+
// Set the prompt
|
|
198
|
+
if (data.contains("prompt")) {
|
|
199
|
+
// Tokenize the prompt
|
|
200
|
+
const auto& tokenized_prompts = tokenize_input_prompts(rn_ctx->vocab, data["prompt"], true, true);
|
|
201
|
+
if (tokenized_prompts.empty() || tokenized_prompts[0].empty()) {
|
|
202
|
+
result.success = false;
|
|
203
|
+
result.error_msg = "Empty prompt";
|
|
204
|
+
result.error_type = RN_ERROR_INVALID_PARAM;
|
|
205
|
+
return result;
|
|
206
|
+
}
|
|
207
|
+
state.prompt_tokens = std::move(tokenized_prompts[0]);
|
|
208
|
+
} else {
|
|
209
|
+
result.success = false;
|
|
210
|
+
result.error_msg = "No prompt provided";
|
|
211
|
+
result.error_type = RN_ERROR_INVALID_PARAM;
|
|
212
|
+
return result;
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
// Configure state
|
|
216
|
+
state.n_ctx = llama_n_ctx(rn_ctx->ctx);
|
|
217
|
+
state.n_predict = options.n_predict > 0 ? options.n_predict : params.n_predict;
|
|
218
|
+
state.n_remaining = state.n_predict;
|
|
219
|
+
|
|
204
220
|
// Process the prompt
|
|
205
221
|
for (int i = 0; i < (int)state.prompt_tokens.size(); ++i) {
|
|
206
222
|
llama_token token = state.prompt_tokens[i];
|
|
@@ -222,12 +238,22 @@ CompletionResult run_completion(
|
|
|
222
238
|
return result;
|
|
223
239
|
}
|
|
224
240
|
|
|
225
|
-
|
|
241
|
+
// For lazy grammars, we need to accept prompt tokens to properly set up the grammar state
|
|
242
|
+
// For non-lazy grammars, we only accept if no grammar is present (grammar needs clean state)
|
|
243
|
+
if (sampling_params.grammar.empty() || sampling_params.grammar_lazy) {
|
|
244
|
+
common_sampler_accept(state.sampler, token, true);
|
|
245
|
+
}
|
|
226
246
|
state.n_past++;
|
|
227
247
|
}
|
|
228
248
|
|
|
229
249
|
result.n_prompt_tokens = state.prompt_tokens.size();
|
|
230
250
|
|
|
251
|
+
// If using a non-lazy grammar, ensure the sampler is in a clean state for the grammar
|
|
252
|
+
if (!sampling_params.grammar.empty() && !sampling_params.grammar_lazy) {
|
|
253
|
+
common_sampler_free(state.sampler);
|
|
254
|
+
state.sampler = common_sampler_init(rn_ctx->model, sampling_params);
|
|
255
|
+
}
|
|
256
|
+
|
|
231
257
|
// Start generating tokens
|
|
232
258
|
const int64_t t_start_generation = ggml_time_us();
|
|
233
259
|
|
|
@@ -328,6 +354,15 @@ CompletionResult run_chat_completion(
|
|
|
328
354
|
std::function<bool(const std::string&, bool)> callback) {
|
|
329
355
|
|
|
330
356
|
CompletionResult result;
|
|
357
|
+
// Log incoming tools via callback
|
|
358
|
+
/*
|
|
359
|
+
if (callback) {
|
|
360
|
+
std::string tools_json_str = options.tools.dump(2);
|
|
361
|
+
std::string debug_msg = "[DEBUG RN_COMPLETION_OPTIONS_TOOLS] options.tools JSON: " + tools_json_str;
|
|
362
|
+
callback(debug_msg, false); // false for is_done
|
|
363
|
+
}
|
|
364
|
+
*/
|
|
365
|
+
completion_state state;
|
|
331
366
|
|
|
332
367
|
if (!rn_ctx || !rn_ctx->model || !rn_ctx->ctx) {
|
|
333
368
|
result.success = false;
|
|
@@ -362,9 +397,9 @@ CompletionResult run_chat_completion(
|
|
|
362
397
|
// Parse tools if present
|
|
363
398
|
if (data.contains("tools") && !data["tools"].empty()) {
|
|
364
399
|
template_inputs.tools = common_chat_tools_parse_oaicompat(data["tools"]);
|
|
365
|
-
//
|
|
366
|
-
|
|
367
|
-
|
|
400
|
+
// Force parallel_tool_calls to true if tools are present, as this generally
|
|
401
|
+
// aligns with grammars expecting a list of tool calls.
|
|
402
|
+
template_inputs.parallel_tool_calls = true;
|
|
368
403
|
}
|
|
369
404
|
|
|
370
405
|
// Parse tool_choice if present
|
|
@@ -378,13 +413,42 @@ CompletionResult run_chat_completion(
|
|
|
378
413
|
// Apply template
|
|
379
414
|
const auto& chat_params = common_chat_templates_apply(rn_ctx->chat_templates.get(), template_inputs);
|
|
380
415
|
|
|
381
|
-
// Set up completion options
|
|
382
416
|
CompletionOptions cmpl_options = options;
|
|
383
417
|
cmpl_options.prompt = chat_params.prompt;
|
|
384
418
|
|
|
385
|
-
// Apply grammar if needed
|
|
386
419
|
if (!chat_params.grammar.empty()) {
|
|
387
420
|
cmpl_options.grammar = chat_params.grammar;
|
|
421
|
+
// Always force grammar_lazy to false when tools are present
|
|
422
|
+
if (!template_inputs.tools.empty()) {
|
|
423
|
+
cmpl_options.grammar_lazy = false;
|
|
424
|
+
} else {
|
|
425
|
+
// Only use chat_params.grammar_lazy if no tools are present
|
|
426
|
+
cmpl_options.grammar_lazy = chat_params.grammar_lazy;
|
|
427
|
+
}
|
|
428
|
+
// Default to grammar_triggers provided by chat_params
|
|
429
|
+
cmpl_options.grammar_triggers = chat_params.grammar_triggers;
|
|
430
|
+
|
|
431
|
+
bool original_grammar_lazy = chat_params.grammar_lazy; // Store original for logging
|
|
432
|
+
|
|
433
|
+
// Add a debug log to observe final grammar_lazy and grammar_triggers
|
|
434
|
+
/*
|
|
435
|
+
if (callback) {
|
|
436
|
+
std::string tool_choice_str;
|
|
437
|
+
switch (template_inputs.tool_choice) {
|
|
438
|
+
case COMMON_CHAT_TOOL_CHOICE_AUTO: tool_choice_str = "auto"; break;
|
|
439
|
+
case COMMON_CHAT_TOOL_CHOICE_NONE: tool_choice_str = "none"; break;
|
|
440
|
+
case COMMON_CHAT_TOOL_CHOICE_REQUIRED: tool_choice_str = "required"; break;
|
|
441
|
+
default: tool_choice_str = "unknown"; break;
|
|
442
|
+
}
|
|
443
|
+
std::string debug_msg = "[DEBUG CHAT_PARAMS] grammar_lazy: " +
|
|
444
|
+
std::string(cmpl_options.grammar_lazy ? "true" : "false") +
|
|
445
|
+
" | grammar_triggers_count: " + std::to_string(cmpl_options.grammar_triggers.size()) + // Log triggers from cmpl_options
|
|
446
|
+
" | For Tool Choice: " + tool_choice_str +
|
|
447
|
+
" | Parallel Tool Calls: " + std::string(template_inputs.parallel_tool_calls ? "true" : "false") +
|
|
448
|
+
" | Original chat_params.grammar_lazy: " + std::string(original_grammar_lazy ? "true" : "false"); // Log original lazy
|
|
449
|
+
callback(debug_msg, false);
|
|
450
|
+
}
|
|
451
|
+
*/
|
|
388
452
|
}
|
|
389
453
|
|
|
390
454
|
// Run standard completion with the processed prompt
|
|
@@ -435,31 +499,15 @@ CompletionResult run_chat_completion(
|
|
|
435
499
|
|
|
436
500
|
// Add parsed content and tool calls if available
|
|
437
501
|
if (has_parsed_content && !parsed_msg.tool_calls.empty()) {
|
|
438
|
-
//
|
|
439
|
-
|
|
440
|
-
choice["message"]["content"] = parsed_msg.content;
|
|
441
|
-
} else {
|
|
442
|
-
choice["message"]["content"] = nullptr;
|
|
443
|
-
}
|
|
444
|
-
|
|
445
|
-
// Add tool calls to the message
|
|
446
|
-
json tool_calls = json::array();
|
|
447
|
-
for (const auto& tool_call : parsed_msg.tool_calls) {
|
|
448
|
-
json tc = {
|
|
449
|
-
{"id", tool_call.id.empty() ? ("call_" + std::to_string(std::rand())) : tool_call.id},
|
|
450
|
-
{"type", "function"},
|
|
451
|
-
{"function", {
|
|
452
|
-
{"name", tool_call.name},
|
|
453
|
-
{"arguments", tool_call.arguments}
|
|
454
|
-
}}
|
|
455
|
-
};
|
|
456
|
-
tool_calls.push_back(tc);
|
|
457
|
-
}
|
|
458
|
-
choice["message"]["tool_calls"] = tool_calls;
|
|
502
|
+
// Use the server.cpp approach: let the common_chat_msg handle the JSON conversion
|
|
503
|
+
choice["message"] = parsed_msg.to_json_oaicompat<json>();
|
|
459
504
|
choice["finish_reason"] = "tool_calls";
|
|
505
|
+
} else if (has_parsed_content && !parsed_msg.content.empty()) {
|
|
506
|
+
// Regular text response with parsed content
|
|
507
|
+
choice["message"]["content"] = parsed_msg.content;
|
|
460
508
|
} else {
|
|
461
|
-
//
|
|
462
|
-
choice["message"]["content"] =
|
|
509
|
+
// Fallback to raw content if parsing failed or no tools
|
|
510
|
+
choice["message"]["content"] = result.content;
|
|
463
511
|
}
|
|
464
512
|
|
|
465
513
|
choices.push_back(choice);
|
|
@@ -487,3 +535,4 @@ CompletionResult run_chat_completion(
|
|
|
487
535
|
|
|
488
536
|
} // namespace facebook::react
|
|
489
537
|
|
|
538
|
+
|
package/cpp/rn-utils.hpp
CHANGED
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
|
|
11
11
|
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
|
12
12
|
#define JSON_ASSERT GGML_ASSERT
|
|
13
|
-
#include "json.hpp"
|
|
13
|
+
#include "nlohmann/json.hpp"
|
|
14
14
|
#include "base64.hpp"
|
|
15
15
|
#include "chat.h"
|
|
16
16
|
|
|
@@ -66,6 +66,7 @@ struct CompletionOptions {
|
|
|
66
66
|
int seed = -1;
|
|
67
67
|
json tools; // tools for function calling
|
|
68
68
|
std::string tool_choice = "auto"; // tool choice mode: "auto", "none", or "required"
|
|
69
|
+
std::vector<common_grammar_trigger> grammar_triggers; // For lazy grammar
|
|
69
70
|
|
|
70
71
|
// Convert to JSON for the completion API
|
|
71
72
|
json to_json() const {
|
|
@@ -98,6 +99,12 @@ struct CompletionOptions {
|
|
|
98
99
|
j["tools"] = tools;
|
|
99
100
|
j["tool_choice"] = tool_choice;
|
|
100
101
|
}
|
|
102
|
+
// Add grammar_triggers if available (mainly for internal use, not direct API option)
|
|
103
|
+
if (!grammar_triggers.empty()) {
|
|
104
|
+
// This part is tricky as json can't directly hold common_grammar_trigger easily.
|
|
105
|
+
// For now, we'll skip adding it to the generic to_json() as it's passed internally.
|
|
106
|
+
// If it were needed for an API, we'd need a proper serialization for grammar_triggers.
|
|
107
|
+
}
|
|
101
108
|
return j;
|
|
102
109
|
}
|
|
103
110
|
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
#pragma once
|
|
2
2
|
|
|
3
|
-
#include
|
|
4
|
-
|
|
5
|
-
#
|
|
6
|
-
#include
|
|
3
|
+
#include <nlohmann/json_fwd.hpp>
|
|
4
|
+
|
|
5
|
+
#include <functional>
|
|
6
|
+
#include <string>
|
|
7
7
|
|
|
8
8
|
std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
|
|
9
9
|
bool force_gbnf = false);
|
package/ios/include/llama.h
CHANGED
|
@@ -259,9 +259,9 @@ extern "C" {
|
|
|
259
259
|
llama_token * token;
|
|
260
260
|
float * embd;
|
|
261
261
|
llama_pos * pos;
|
|
262
|
-
int32_t * n_seq_id;
|
|
263
|
-
llama_seq_id ** seq_id;
|
|
264
|
-
int8_t * logits;
|
|
262
|
+
int32_t * n_seq_id; // TODO: remove, should belong to only 1 sequence
|
|
263
|
+
llama_seq_id ** seq_id; // TODO: become llama_seq_id * seq_id;
|
|
264
|
+
int8_t * logits; // TODO: rename this to "output"
|
|
265
265
|
} llama_batch;
|
|
266
266
|
|
|
267
267
|
enum llama_model_kv_override_type {
|
|
@@ -366,6 +366,8 @@ extern "C" {
|
|
|
366
366
|
bool no_perf; // measure performance timings
|
|
367
367
|
bool op_offload; // offload host tensor operations to device
|
|
368
368
|
bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
|
|
369
|
+
// NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
|
|
370
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
|
|
369
371
|
};
|
|
370
372
|
|
|
371
373
|
// model quantization parameters
|
|
@@ -502,6 +504,7 @@ extern "C" {
|
|
|
502
504
|
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
|
|
503
505
|
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
|
|
504
506
|
LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
|
|
507
|
+
LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model);
|
|
505
508
|
|
|
506
509
|
// Get the model's RoPE frequency scaling factor
|
|
507
510
|
LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
|
|
@@ -652,7 +655,6 @@ extern "C" {
|
|
|
652
655
|
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
|
653
656
|
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
|
654
657
|
// - lazily on next llama_decode()
|
|
655
|
-
// - explicitly with llama_kv_self_update()
|
|
656
658
|
// p0 < 0 : [0, p1]
|
|
657
659
|
// p1 < 0 : [p0, inf)
|
|
658
660
|
LLAMA_API void llama_kv_self_seq_add(
|
|
@@ -665,7 +667,6 @@ extern "C" {
|
|
|
665
667
|
// Integer division of the positions by factor of `d > 1`
|
|
666
668
|
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
|
667
669
|
// - lazily on next llama_decode()
|
|
668
|
-
// - explicitly with llama_kv_self_update()
|
|
669
670
|
// p0 < 0 : [0, p1]
|
|
670
671
|
// p1 < 0 : [p0, inf)
|
|
671
672
|
LLAMA_API void llama_kv_self_seq_div(
|
|
@@ -677,12 +678,14 @@ extern "C" {
|
|
|
677
678
|
|
|
678
679
|
// Returns the smallest position present in the KV cache for the specified sequence
|
|
679
680
|
// This is typically non-zero only for SWA caches
|
|
681
|
+
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
|
|
680
682
|
// Return -1 if the sequence is empty
|
|
681
683
|
LLAMA_API llama_pos llama_kv_self_seq_pos_min(
|
|
682
684
|
struct llama_context * ctx,
|
|
683
685
|
llama_seq_id seq_id);
|
|
684
686
|
|
|
685
687
|
// Returns the largest position present in the KV cache for the specified sequence
|
|
688
|
+
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
|
|
686
689
|
// Return -1 if the sequence is empty
|
|
687
690
|
LLAMA_API llama_pos llama_kv_self_seq_pos_max(
|
|
688
691
|
struct llama_context * ctx,
|
|
@@ -691,14 +694,15 @@ extern "C" {
|
|
|
691
694
|
// Defragment the KV cache
|
|
692
695
|
// This will be applied:
|
|
693
696
|
// - lazily on next llama_decode()
|
|
694
|
-
|
|
695
|
-
|
|
697
|
+
LLAMA_API DEPRECATED(void llama_kv_self_defrag(struct llama_context * ctx),
|
|
698
|
+
"simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
|
|
696
699
|
|
|
697
700
|
// Check if the context supports KV cache shifting
|
|
698
701
|
LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
|
|
699
702
|
|
|
700
703
|
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
|
|
701
|
-
LLAMA_API void llama_kv_self_update(struct llama_context * ctx)
|
|
704
|
+
LLAMA_API DEPRECATED(void llama_kv_self_update(struct llama_context * ctx),
|
|
705
|
+
"simply remove this call, updates are applied lazily on the next llama_decode()");
|
|
702
706
|
|
|
703
707
|
//
|
|
704
708
|
// State / sessions
|