@novastera-oss/llamarn 0.2.5 → 0.2.6
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/RNLlamaCpp.podspec +3 -2
- package/android/CMakeLists.txt +6 -3
- package/android/src/main/cpp/include/llama.h +12 -8
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +46 -65
- package/cpp/LlamaCppModel.h +5 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/README.md +1 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +5 -8
- package/cpp/llama.cpp/common/arg.cpp +8 -6
- package/cpp/llama.cpp/common/chat-parser.cpp +4 -3
- package/cpp/llama.cpp/common/chat-parser.h +2 -1
- package/cpp/llama.cpp/common/chat.cpp +4 -4
- package/cpp/llama.cpp/common/common.cpp +2 -0
- package/cpp/llama.cpp/common/json-partial.cpp +5 -4
- package/cpp/llama.cpp/common/json-partial.h +2 -1
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
- package/cpp/llama.cpp/common/json-schema-to-grammar.h +4 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +31 -28
- package/cpp/llama.cpp/ggml/include/ggml.h +1 -3
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +10 -5
- package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +23 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +19 -8
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +118 -11
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml.c +9 -2
- package/cpp/llama.cpp/ggml/src/ggml.cpp +26 -0
- package/cpp/llama.cpp/ggml/src/gguf.cpp +19 -2
- package/cpp/llama.cpp/include/llama.h +12 -8
- package/cpp/llama.cpp/src/CMakeLists.txt +3 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +19 -12
- package/cpp/llama.cpp/src/llama-batch.h +15 -10
- package/cpp/llama.cpp/src/llama-context.cpp +226 -151
- package/cpp/llama.cpp/src/llama-context.h +25 -8
- package/cpp/llama.cpp/src/llama-graph.cpp +50 -47
- package/cpp/llama.cpp/src/llama-graph.h +25 -24
- package/cpp/llama.cpp/src/llama-kv-cache-recurrent.cpp +1132 -0
- package/cpp/llama.cpp/src/llama-kv-cache-recurrent.h +191 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +249 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +136 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +1717 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +278 -0
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -2746
- package/cpp/llama.cpp/src/llama-kv-cache.h +14 -472
- package/cpp/llama.cpp/src/llama-kv-cells.h +37 -6
- package/cpp/llama.cpp/src/llama-memory.h +44 -0
- package/cpp/llama.cpp/src/llama-model.cpp +23 -16
- package/cpp/llama.cpp/src/llama-vocab.cpp +7 -2
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +10518 -0
- package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +93468 -0
- package/cpp/llama.cpp/{common → vendor}/minja/chat-template.hpp +1 -1
- package/cpp/llama.cpp/{common → vendor}/minja/minja.hpp +1 -1
- package/cpp/llama.cpp/{common → vendor/nlohmann}/json.hpp +3027 -2267
- package/cpp/llama.cpp/vendor/nlohmann/json_fwd.hpp +187 -0
- package/cpp/llama.cpp/vendor/stb/stb_image.h +7988 -0
- package/cpp/rn-completion.cpp +63 -8
- package/cpp/rn-utils.hpp +8 -1
- package/ios/include/common/minja/chat-template.hpp +1 -1
- package/ios/include/common/minja/minja.hpp +1 -1
- package/ios/include/json-schema-to-grammar.h +4 -4
- package/ios/include/llama.h +12 -8
- package/ios/include/{common → nlohmann}/json.hpp +3027 -2267
- package/ios/libs/llama.xcframework/Info.plist +22 -22
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4689 -4617
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4638
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3622 -3557
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4638
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3624 -3559
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4689 -4616
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4637
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3622 -3556
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4725 -4653
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4746 -4674
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3652 -3587
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
package/cpp/rn-completion.cpp
CHANGED
|
@@ -152,6 +152,16 @@ CompletionResult run_completion(
|
|
|
152
152
|
common_params_sampling sampling_params = params.sampling;
|
|
153
153
|
if (!options.grammar.empty()) {
|
|
154
154
|
sampling_params.grammar = options.grammar;
|
|
155
|
+
// Force grammar_lazy to false whenever tools are present to ensure strict JSON format enforcement
|
|
156
|
+
if (!options.tools.empty()) {
|
|
157
|
+
sampling_params.grammar_lazy = false;
|
|
158
|
+
} else {
|
|
159
|
+
sampling_params.grammar_lazy = options.grammar_lazy;
|
|
160
|
+
}
|
|
161
|
+
// Pass grammar_triggers if any were provided by chat_params and passed via options
|
|
162
|
+
if (!options.grammar_triggers.empty()) {
|
|
163
|
+
sampling_params.grammar_triggers = options.grammar_triggers;
|
|
164
|
+
}
|
|
155
165
|
}
|
|
156
166
|
|
|
157
167
|
// Parse tool_choice
|
|
@@ -228,9 +238,9 @@ CompletionResult run_completion(
|
|
|
228
238
|
return result;
|
|
229
239
|
}
|
|
230
240
|
|
|
231
|
-
//
|
|
232
|
-
//
|
|
233
|
-
if (sampling_params.grammar.empty()) {
|
|
241
|
+
// For lazy grammars, we need to accept prompt tokens to properly set up the grammar state
|
|
242
|
+
// For non-lazy grammars, we only accept if no grammar is present (grammar needs clean state)
|
|
243
|
+
if (sampling_params.grammar.empty() || sampling_params.grammar_lazy) {
|
|
234
244
|
common_sampler_accept(state.sampler, token, true);
|
|
235
245
|
}
|
|
236
246
|
state.n_past++;
|
|
@@ -238,6 +248,12 @@ CompletionResult run_completion(
|
|
|
238
248
|
|
|
239
249
|
result.n_prompt_tokens = state.prompt_tokens.size();
|
|
240
250
|
|
|
251
|
+
// If using a non-lazy grammar, ensure the sampler is in a clean state for the grammar
|
|
252
|
+
if (!sampling_params.grammar.empty() && !sampling_params.grammar_lazy) {
|
|
253
|
+
common_sampler_free(state.sampler);
|
|
254
|
+
state.sampler = common_sampler_init(rn_ctx->model, sampling_params);
|
|
255
|
+
}
|
|
256
|
+
|
|
241
257
|
// Start generating tokens
|
|
242
258
|
const int64_t t_start_generation = ggml_time_us();
|
|
243
259
|
|
|
@@ -338,6 +354,15 @@ CompletionResult run_chat_completion(
|
|
|
338
354
|
std::function<bool(const std::string&, bool)> callback) {
|
|
339
355
|
|
|
340
356
|
CompletionResult result;
|
|
357
|
+
// Log incoming tools via callback
|
|
358
|
+
/*
|
|
359
|
+
if (callback) {
|
|
360
|
+
std::string tools_json_str = options.tools.dump(2);
|
|
361
|
+
std::string debug_msg = "[DEBUG RN_COMPLETION_OPTIONS_TOOLS] options.tools JSON: " + tools_json_str;
|
|
362
|
+
callback(debug_msg, false); // false for is_done
|
|
363
|
+
}
|
|
364
|
+
*/
|
|
365
|
+
completion_state state;
|
|
341
366
|
|
|
342
367
|
if (!rn_ctx || !rn_ctx->model || !rn_ctx->ctx) {
|
|
343
368
|
result.success = false;
|
|
@@ -372,9 +397,9 @@ CompletionResult run_chat_completion(
|
|
|
372
397
|
// Parse tools if present
|
|
373
398
|
if (data.contains("tools") && !data["tools"].empty()) {
|
|
374
399
|
template_inputs.tools = common_chat_tools_parse_oaicompat(data["tools"]);
|
|
375
|
-
//
|
|
376
|
-
|
|
377
|
-
|
|
400
|
+
// Force parallel_tool_calls to true if tools are present, as this generally
|
|
401
|
+
// aligns with grammars expecting a list of tool calls.
|
|
402
|
+
template_inputs.parallel_tool_calls = true;
|
|
378
403
|
}
|
|
379
404
|
|
|
380
405
|
// Parse tool_choice if present
|
|
@@ -388,13 +413,42 @@ CompletionResult run_chat_completion(
|
|
|
388
413
|
// Apply template
|
|
389
414
|
const auto& chat_params = common_chat_templates_apply(rn_ctx->chat_templates.get(), template_inputs);
|
|
390
415
|
|
|
391
|
-
// Set up completion options
|
|
392
416
|
CompletionOptions cmpl_options = options;
|
|
393
417
|
cmpl_options.prompt = chat_params.prompt;
|
|
394
418
|
|
|
395
|
-
// Apply grammar if needed
|
|
396
419
|
if (!chat_params.grammar.empty()) {
|
|
397
420
|
cmpl_options.grammar = chat_params.grammar;
|
|
421
|
+
// Always force grammar_lazy to false when tools are present
|
|
422
|
+
if (!template_inputs.tools.empty()) {
|
|
423
|
+
cmpl_options.grammar_lazy = false;
|
|
424
|
+
} else {
|
|
425
|
+
// Only use chat_params.grammar_lazy if no tools are present
|
|
426
|
+
cmpl_options.grammar_lazy = chat_params.grammar_lazy;
|
|
427
|
+
}
|
|
428
|
+
// Default to grammar_triggers provided by chat_params
|
|
429
|
+
cmpl_options.grammar_triggers = chat_params.grammar_triggers;
|
|
430
|
+
|
|
431
|
+
bool original_grammar_lazy = chat_params.grammar_lazy; // Store original for logging
|
|
432
|
+
|
|
433
|
+
// Add a debug log to observe final grammar_lazy and grammar_triggers
|
|
434
|
+
/*
|
|
435
|
+
if (callback) {
|
|
436
|
+
std::string tool_choice_str;
|
|
437
|
+
switch (template_inputs.tool_choice) {
|
|
438
|
+
case COMMON_CHAT_TOOL_CHOICE_AUTO: tool_choice_str = "auto"; break;
|
|
439
|
+
case COMMON_CHAT_TOOL_CHOICE_NONE: tool_choice_str = "none"; break;
|
|
440
|
+
case COMMON_CHAT_TOOL_CHOICE_REQUIRED: tool_choice_str = "required"; break;
|
|
441
|
+
default: tool_choice_str = "unknown"; break;
|
|
442
|
+
}
|
|
443
|
+
std::string debug_msg = "[DEBUG CHAT_PARAMS] grammar_lazy: " +
|
|
444
|
+
std::string(cmpl_options.grammar_lazy ? "true" : "false") +
|
|
445
|
+
" | grammar_triggers_count: " + std::to_string(cmpl_options.grammar_triggers.size()) + // Log triggers from cmpl_options
|
|
446
|
+
" | For Tool Choice: " + tool_choice_str +
|
|
447
|
+
" | Parallel Tool Calls: " + std::string(template_inputs.parallel_tool_calls ? "true" : "false") +
|
|
448
|
+
" | Original chat_params.grammar_lazy: " + std::string(original_grammar_lazy ? "true" : "false"); // Log original lazy
|
|
449
|
+
callback(debug_msg, false);
|
|
450
|
+
}
|
|
451
|
+
*/
|
|
398
452
|
}
|
|
399
453
|
|
|
400
454
|
// Run standard completion with the processed prompt
|
|
@@ -481,3 +535,4 @@ CompletionResult run_chat_completion(
|
|
|
481
535
|
|
|
482
536
|
} // namespace facebook::react
|
|
483
537
|
|
|
538
|
+
|
package/cpp/rn-utils.hpp
CHANGED
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
|
|
11
11
|
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
|
12
12
|
#define JSON_ASSERT GGML_ASSERT
|
|
13
|
-
#include "json.hpp"
|
|
13
|
+
#include "nlohmann/json.hpp"
|
|
14
14
|
#include "base64.hpp"
|
|
15
15
|
#include "chat.h"
|
|
16
16
|
|
|
@@ -66,6 +66,7 @@ struct CompletionOptions {
|
|
|
66
66
|
int seed = -1;
|
|
67
67
|
json tools; // tools for function calling
|
|
68
68
|
std::string tool_choice = "auto"; // tool choice mode: "auto", "none", or "required"
|
|
69
|
+
std::vector<common_grammar_trigger> grammar_triggers; // For lazy grammar
|
|
69
70
|
|
|
70
71
|
// Convert to JSON for the completion API
|
|
71
72
|
json to_json() const {
|
|
@@ -98,6 +99,12 @@ struct CompletionOptions {
|
|
|
98
99
|
j["tools"] = tools;
|
|
99
100
|
j["tool_choice"] = tool_choice;
|
|
100
101
|
}
|
|
102
|
+
// Add grammar_triggers if available (mainly for internal use, not direct API option)
|
|
103
|
+
if (!grammar_triggers.empty()) {
|
|
104
|
+
// This part is tricky as json can't directly hold common_grammar_trigger easily.
|
|
105
|
+
// For now, we'll skip adding it to the generic to_json() as it's passed internally.
|
|
106
|
+
// If it were needed for an API, we'd need a proper serialization for grammar_triggers.
|
|
107
|
+
}
|
|
101
108
|
return j;
|
|
102
109
|
}
|
|
103
110
|
|
|
package/ios/include/json-schema-to-grammar.h
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
#pragma once
|
|
2
2
|
|
|
3
|
-
#include
|
|
4
|
-
|
|
5
|
-
#
|
|
6
|
-
#include
|
|
3
|
+
#include <nlohmann/json_fwd.hpp>
|
|
4
|
+
|
|
5
|
+
#include <functional>
|
|
6
|
+
#include <string>
|
|
7
7
|
|
|
8
8
|
std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
|
|
9
9
|
bool force_gbnf = false);
|
package/ios/include/llama.h
CHANGED
|
@@ -259,9 +259,9 @@ extern "C" {
|
|
|
259
259
|
llama_token * token;
|
|
260
260
|
float * embd;
|
|
261
261
|
llama_pos * pos;
|
|
262
|
-
int32_t * n_seq_id;
|
|
263
|
-
llama_seq_id ** seq_id;
|
|
264
|
-
int8_t * logits;
|
|
262
|
+
int32_t * n_seq_id; // TODO: remove, should belong to only 1 sequence
|
|
263
|
+
llama_seq_id ** seq_id; // TODO: become llama_seq_id * seq_id;
|
|
264
|
+
int8_t * logits; // TODO: rename this to "output"
|
|
265
265
|
} llama_batch;
|
|
266
266
|
|
|
267
267
|
enum llama_model_kv_override_type {
|
|
@@ -366,6 +366,8 @@ extern "C" {
|
|
|
366
366
|
bool no_perf; // measure performance timings
|
|
367
367
|
bool op_offload; // offload host tensor operations to device
|
|
368
368
|
bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
|
|
369
|
+
// NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
|
|
370
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
|
|
369
371
|
};
|
|
370
372
|
|
|
371
373
|
// model quantization parameters
|
|
@@ -502,6 +504,7 @@ extern "C" {
|
|
|
502
504
|
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
|
|
503
505
|
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
|
|
504
506
|
LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
|
|
507
|
+
LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model);
|
|
505
508
|
|
|
506
509
|
// Get the model's RoPE frequency scaling factor
|
|
507
510
|
LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
|
|
@@ -652,7 +655,6 @@ extern "C" {
|
|
|
652
655
|
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
|
653
656
|
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
|
654
657
|
// - lazily on next llama_decode()
|
|
655
|
-
// - explicitly with llama_kv_self_update()
|
|
656
658
|
// p0 < 0 : [0, p1]
|
|
657
659
|
// p1 < 0 : [p0, inf)
|
|
658
660
|
LLAMA_API void llama_kv_self_seq_add(
|
|
@@ -665,7 +667,6 @@ extern "C" {
|
|
|
665
667
|
// Integer division of the positions by factor of `d > 1`
|
|
666
668
|
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
|
667
669
|
// - lazily on next llama_decode()
|
|
668
|
-
// - explicitly with llama_kv_self_update()
|
|
669
670
|
// p0 < 0 : [0, p1]
|
|
670
671
|
// p1 < 0 : [p0, inf)
|
|
671
672
|
LLAMA_API void llama_kv_self_seq_div(
|
|
@@ -677,12 +678,14 @@ extern "C" {
|
|
|
677
678
|
|
|
678
679
|
// Returns the smallest position present in the KV cache for the specified sequence
|
|
679
680
|
// This is typically non-zero only for SWA caches
|
|
681
|
+
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
|
|
680
682
|
// Return -1 if the sequence is empty
|
|
681
683
|
LLAMA_API llama_pos llama_kv_self_seq_pos_min(
|
|
682
684
|
struct llama_context * ctx,
|
|
683
685
|
llama_seq_id seq_id);
|
|
684
686
|
|
|
685
687
|
// Returns the largest position present in the KV cache for the specified sequence
|
|
688
|
+
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
|
|
686
689
|
// Return -1 if the sequence is empty
|
|
687
690
|
LLAMA_API llama_pos llama_kv_self_seq_pos_max(
|
|
688
691
|
struct llama_context * ctx,
|
|
@@ -691,14 +694,15 @@ extern "C" {
|
|
|
691
694
|
// Defragment the KV cache
|
|
692
695
|
// This will be applied:
|
|
693
696
|
// - lazily on next llama_decode()
|
|
694
|
-
|
|
695
|
-
|
|
697
|
+
LLAMA_API DEPRECATED(void llama_kv_self_defrag(struct llama_context * ctx),
|
|
698
|
+
"simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
|
|
696
699
|
|
|
697
700
|
// Check if the context supports KV cache shifting
|
|
698
701
|
LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
|
|
699
702
|
|
|
700
703
|
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
|
|
701
|
-
LLAMA_API void llama_kv_self_update(struct llama_context * ctx)
|
|
704
|
+
LLAMA_API DEPRECATED(void llama_kv_self_update(struct llama_context * ctx),
|
|
705
|
+
"simply remove this call, updates are applied lazily on the next llama_decode()");
|
|
702
706
|
|
|
703
707
|
//
|
|
704
708
|
// State / sessions
|