cui-llama.rn 1.3.5 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -1
- package/android/src/main/CMakeLists.txt +25 -20
- package/android/src/main/java/com/rnllama/LlamaContext.java +31 -9
- package/android/src/main/java/com/rnllama/RNLlama.java +98 -0
- package/android/src/main/jni-utils.h +94 -0
- package/android/src/main/jni.cpp +108 -37
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +15 -0
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +15 -0
- package/cpp/common.cpp +1982 -1965
- package/cpp/common.h +665 -657
- package/cpp/ggml-backend-reg.cpp +5 -0
- package/cpp/ggml-backend.cpp +5 -2
- package/cpp/ggml-cpp.h +1 -0
- package/cpp/ggml-cpu-aarch64.cpp +6 -1
- package/cpp/ggml-cpu-quants.c +5 -1
- package/cpp/ggml-cpu.c +14122 -14122
- package/cpp/ggml-cpu.cpp +627 -627
- package/cpp/ggml-impl.h +11 -16
- package/cpp/ggml-metal-impl.h +288 -0
- package/cpp/ggml-metal.m +2 -2
- package/cpp/ggml-opt.cpp +854 -0
- package/cpp/ggml-opt.h +216 -0
- package/cpp/ggml.c +0 -1276
- package/cpp/ggml.h +0 -140
- package/cpp/gguf.cpp +1325 -0
- package/cpp/gguf.h +202 -0
- package/cpp/llama-adapter.cpp +346 -0
- package/cpp/llama-adapter.h +73 -0
- package/cpp/llama-arch.cpp +1434 -0
- package/cpp/llama-arch.h +395 -0
- package/cpp/llama-batch.cpp +368 -0
- package/cpp/llama-batch.h +88 -0
- package/cpp/llama-chat.cpp +567 -0
- package/cpp/llama-chat.h +51 -0
- package/cpp/llama-context.cpp +1771 -0
- package/cpp/llama-context.h +128 -0
- package/cpp/llama-cparams.cpp +1 -0
- package/cpp/llama-cparams.h +37 -0
- package/cpp/llama-cpp.h +30 -0
- package/cpp/llama-grammar.cpp +1 -0
- package/cpp/llama-grammar.h +3 -1
- package/cpp/llama-hparams.cpp +71 -0
- package/cpp/llama-hparams.h +140 -0
- package/cpp/llama-impl.cpp +167 -0
- package/cpp/llama-impl.h +16 -136
- package/cpp/llama-kv-cache.cpp +718 -0
- package/cpp/llama-kv-cache.h +218 -0
- package/cpp/llama-mmap.cpp +589 -0
- package/cpp/llama-mmap.h +67 -0
- package/cpp/llama-model-loader.cpp +1011 -0
- package/cpp/llama-model-loader.h +158 -0
- package/cpp/llama-model.cpp +2202 -0
- package/cpp/llama-model.h +391 -0
- package/cpp/llama-sampling.cpp +117 -4
- package/cpp/llama-vocab.cpp +21 -28
- package/cpp/llama-vocab.h +13 -1
- package/cpp/llama.cpp +12547 -23528
- package/cpp/llama.h +31 -6
- package/cpp/rn-llama.hpp +90 -87
- package/cpp/sgemm.cpp +776 -70
- package/cpp/sgemm.h +14 -14
- package/cpp/unicode.cpp +6 -0
- package/ios/RNLlama.mm +47 -0
- package/ios/RNLlamaContext.h +3 -1
- package/ios/RNLlamaContext.mm +71 -14
- package/jest/mock.js +15 -3
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/index.js +33 -37
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/index.js +31 -35
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +26 -6
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +21 -36
- package/lib/typescript/index.d.ts.map +1 -1
- package/llama-rn.podspec +4 -18
- package/package.json +2 -3
- package/src/NativeRNLlama.ts +32 -13
- package/src/index.ts +52 -47
package/cpp/llama.h
CHANGED
@@ -35,7 +35,6 @@
 
 #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
-// TODO: use everywhere in the implementation
 #define LLAMA_TOKEN_NULL -1
 
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
@@ -106,6 +105,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
         LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
     };
 
     enum llama_rope_type {
@@ -386,6 +386,7 @@ extern "C" {
     } llama_chat_message;
 
     // lora adapter
+    // TODO: rename to llama_adapter_lora
     struct llama_lora_adapter;
 
     // Helpers for getting default parameters
@@ -413,11 +414,19 @@ extern "C" {
     // Call once at the end of the program - currently only used for MPI
     LLAMA_API void llama_backend_free(void);
 
-    LLAMA_API struct llama_model * llama_load_model_from_file(
+    DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
+                             const char * path_model,
+                             struct llama_model_params params),
+            "use llama_model_load_from_file instead");
+
+    LLAMA_API struct llama_model * llama_model_load_from_file(
                              const char * path_model,
                              struct llama_model_params params);
 
-    LLAMA_API void llama_free_model(struct llama_model * model);
+    DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
+            "use llama_model_free instead");
+
+    LLAMA_API void llama_model_free(struct llama_model * model);
 
     // TODO: rename to llama_init_from_model
     LLAMA_API struct llama_context * llama_new_context_with_model(
@@ -502,14 +511,19 @@ extern "C" {
             const char * fname_out,
             const llama_model_quantize_params * params);
 
+    //
+    // Adapters
+    //
+
     // Load a LoRA adapter from file
-    //
+    // TODO: rename to llama_adapter_lora_init
     LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
             struct llama_model * model,
             const char * path_lora);
 
     // Add a loaded LoRA adapter to given context
     // This will not modify model's weight
+    // TODO: rename to llama_set_adapter_lora
     LLAMA_API int32_t llama_lora_adapter_set(
             struct llama_context * ctx,
             struct llama_lora_adapter * adapter,
@@ -517,16 +531,18 @@ extern "C" {
 
     // Remove a specific LoRA adapter from given context
     // Return -1 if the adapter is not present in the context
+    // TODO: rename to llama_rm_adapter_lora
     LLAMA_API int32_t llama_lora_adapter_remove(
             struct llama_context * ctx,
             struct llama_lora_adapter * adapter);
 
     // Remove all LoRA adapters from given context
-    LLAMA_API void llama_lora_adapter_clear(
-            struct llama_context * ctx);
+    // TODO: rename to llama_clear_adapter_lora
+    LLAMA_API void llama_lora_adapter_clear(struct llama_context * ctx);
 
     // Manually free a LoRA adapter
     // Note: loaded adapters will be free when the associated model is deleted
+    // TODO: rename to llama_adapter_lora_free
     LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
 
     // Apply a loaded control vector to a llama_context, or if data is NULL, clear
@@ -535,6 +551,7 @@ extern "C" {
     // to an n_embd x n_layers buffer starting from layer 1.
     // il_start and il_end are the layer range the vector should apply to (both inclusive)
     // See llama_control_vector_load in common to load a control vector.
+    // TODO: rename to llama_adapter_cvec_apply
     LLAMA_API int32_t llama_control_vector_apply(
             struct llama_context * lctx,
             const float * data,
@@ -547,6 +564,8 @@ extern "C" {
     // KV cache
     //
 
+    // TODO: remove llama_kv_cache_view_* API
+
     // Information associated with an individual cell in the KV cache view.
     struct llama_kv_cache_view_cell {
         // The position for this cell. Takes KV cache shifts into account.
@@ -593,8 +612,11 @@ extern "C" {
     LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
 
     // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+    // TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
     LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
 
+    ///
+
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
     LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
@@ -664,6 +686,9 @@ extern "C" {
             struct llama_context * ctx,
             llama_seq_id seq_id);
 
+    // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache
+    //       how to avoid this?
+
     // Defragment the KV cache
     // This will be applied:
     //   - lazily on next llama_decode()
package/cpp/rn-llama.hpp
CHANGED
@@ -5,64 +5,35 @@
 #include <iostream>
 #include "common.h"
 #include "ggml.h"
+#include "gguf.h"
 #include "llama.h"
 #include "llama-impl.h"
 #include "sampling.h"
+#if defined(__ANDROID__)
+#include <android/log.h>
+#endif
 
 namespace rnllama {
 
-static std::string lm_gguf_data_to_str(enum lm_gguf_type type, const void * data, int i) {
-    switch (type) {
-        case LM_GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t  *)data)[i]);
-        case LM_GGUF_TYPE_INT8:    return std::to_string(((const int8_t   *)data)[i]);
-        case LM_GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
-        case LM_GGUF_TYPE_INT16:   return std::to_string(((const int16_t  *)data)[i]);
-        case LM_GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
-        case LM_GGUF_TYPE_INT32:   return std::to_string(((const int32_t  *)data)[i]);
-        case LM_GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
-        case LM_GGUF_TYPE_INT64:   return std::to_string(((const int64_t  *)data)[i]);
-        case LM_GGUF_TYPE_FLOAT32: return std::to_string(((const float    *)data)[i]);
-        case LM_GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
-        case LM_GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
-        default: return "unknown type: " + std::to_string(type);
-    }
-}
-
-static std::string lm_gguf_kv_to_str(const struct lm_gguf_context * ctx_gguf, int i) {
-    const enum lm_gguf_type type = lm_gguf_get_kv_type(ctx_gguf, i);
+const std::vector<lm_ggml_type> kv_cache_types = {
+    LM_GGML_TYPE_F32,
+    LM_GGML_TYPE_F16,
+    LM_GGML_TYPE_BF16,
+    LM_GGML_TYPE_Q8_0,
+    LM_GGML_TYPE_Q4_0,
+    LM_GGML_TYPE_Q4_1,
+    LM_GGML_TYPE_IQ4_NL,
+    LM_GGML_TYPE_Q5_0,
+    LM_GGML_TYPE_Q5_1,
+};
 
-    switch (type) {
-        case LM_GGUF_TYPE_STRING:
-            return lm_gguf_get_val_str(ctx_gguf, i);
-        case LM_GGUF_TYPE_ARRAY:
-            {
-                const enum lm_gguf_type arr_type = lm_gguf_get_arr_type(ctx_gguf, i);
-                int arr_n = lm_gguf_get_arr_n(ctx_gguf, i);
-                const void * data = lm_gguf_get_arr_data(ctx_gguf, i);
-                std::stringstream ss;
-                ss << "[";
-                for (int j = 0; j < arr_n; j++) {
-                    if (arr_type == LM_GGUF_TYPE_STRING) {
-                        std::string val = lm_gguf_get_arr_str(ctx_gguf, i, j);
-                        // escape quotes
-                        replace_all(val, "\\", "\\\\");
-                        replace_all(val, "\"", "\\\"");
-                        ss << '"' << val << '"';
-                    } else if (arr_type == LM_GGUF_TYPE_ARRAY) {
-                        ss << "???";
-                    } else {
-                        ss << lm_gguf_data_to_str(arr_type, data, j);
-                    }
-                    if (j < arr_n - 1) {
-                        ss << ", ";
-                    }
-                }
-                ss << "]";
-                return ss.str();
-            }
-        default:
-            return lm_gguf_data_to_str(type, lm_gguf_get_val_data(ctx_gguf, i), 0);
+static lm_ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (lm_ggml_type_name(type) == s) {
+            return type;
+        }
     }
+    throw std::runtime_error("Unsupported cache type: " + s);
 }
 
 static void llama_batch_clear(llama_batch *batch) {
@@ -86,16 +57,32 @@ static void llama_batch_add(llama_batch *batch, llama_token id, llama_pos pos, s
 static void log(const char *level, const char *function, int line,
                 const char *format, ...)
 {
-    printf("[%s] %s:%d ", level, function, line);
-
     va_list args;
-
-
-
-
-
+#if defined(__ANDROID__)
+    char prefix[256];
+    snprintf(prefix, sizeof(prefix), "%s:%d %s", function, line, format);
+
+    va_start(args, format);
+    android_LogPriority priority;
+    if (strcmp(level, "ERROR") == 0) {
+        priority = ANDROID_LOG_ERROR;
+    } else if (strcmp(level, "WARNING") == 0) {
+        priority = ANDROID_LOG_WARN;
+    } else if (strcmp(level, "INFO") == 0) {
+        priority = ANDROID_LOG_INFO;
+    } else {
+        priority = ANDROID_LOG_DEBUG;
+    }
+    __android_log_vprint(priority, "RNLlama", prefix, args);
+    va_end(args);
+#else
+    printf("[%s] %s:%d ", level, function, line);
+    va_start(args, format);
+    vprintf(format, args);
+    va_end(args);
+    printf("\n");
+#endif
 }
-
 static bool rnllama_verbose = false;
 
 #if RNLLAMA_VERBOSE != 1
@@ -187,7 +174,7 @@ static std::string tokens_to_output_formatted_string(const llama_context *ctx, c
 }
 
 template <class Iter>
-static std::string tokens_to_str(llama_context
+static std::string tokens_to_str(llama_context* ctx, Iter begin, Iter end)
 {
     std::string ret;
     for (; begin != end; ++begin)
@@ -214,6 +201,8 @@ struct llama_rn_context
 
     common_params params;
 
+    common_init_result llama_init;
+
     llama_model *model = nullptr;
     float loading_progress = 0;
     bool is_load_interrupted = false;
@@ -230,18 +219,10 @@ struct llama_rn_context
     std::string stopping_word;
     bool incomplete = false;
 
+    std::vector<common_lora_adapter_info> lora;
+
     ~llama_rn_context()
     {
-        if (ctx)
-        {
-            llama_free(ctx);
-            ctx = nullptr;
-        }
-        if (model)
-        {
-            llama_free_model(model);
-            model = nullptr;
-        }
         if (ctx_sampling != nullptr)
         {
             common_sampler_free(ctx_sampling);
@@ -280,30 +261,26 @@ struct llama_rn_context
     bool loadModel(common_params &params_)
     {
         params = params_;
-
-        model =
-        ctx =
+        llama_init = common_init_from_params(params);
+        model = llama_init.model.get();
+        ctx = llama_init.context.get();
         if (model == nullptr)
         {
            LOG_ERROR("unable to load model: %s", params_.model.c_str());
           return false;
        }
-        LOG_VERBOSE("getting n_ctx");
        n_ctx = llama_n_ctx(ctx);
+
+        // We can uncomment for debugging or after this fix: https://github.com/ggerganov/llama.cpp/pull/11101
+        // LOG_INFO("%s\n", common_params_get_system_info(params).c_str());
+
        return true;
    }
 
    bool validateModelChatTemplate() const {
-
-
-
-        if (res >= 0) {
-            llama_chat_message chat[] = {{"user", "test"}};
-            std::string tmpl = std::string(model_template.data(), model_template.size());
-            int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
-            return chat_res > 0;
-        }
-        return res > 0;
+        llama_chat_message chat[] = {{"user", "test"}};
+        int32_t chat_res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
+        return chat_res > 0;
    }
 
    void truncatePrompt(std::vector<llama_token> &prompt_tokens) {
@@ -330,7 +307,7 @@ struct llama_rn_context
 
    void loadPrompt()
    {
-        std::vector<llama_token> prompt_tokens = ::common_tokenize(
+        std::vector<llama_token> prompt_tokens = ::common_tokenize(model, params.prompt, true, true);
        num_prompt_tokens = prompt_tokens.size();
 
        // LOG tokens
@@ -439,6 +416,7 @@ struct llama_rn_context
        }
        if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval)))
        {
+
            LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s",
                n_eval,
                n_past,
@@ -477,7 +455,7 @@ struct llama_rn_context
        const int32_t n_probs = params.sampling.n_probs;
 
        // deprecated
-        /*if (params.
+        /*if (params.sampling.temp <= 0 && n_probs > 0)
        {
            // For llama_sample_token_greedy we need to sort candidates
            llama_sampler_init_softmax();
@@ -647,7 +625,11 @@ struct llama_rn_context
        double tg_std = 0;
 
        // TODO: move batch into llama_rn_context (related https://github.com/mybigday/llama.rn/issues/30)
-        llama_batch batch = llama_batch_init(
+        llama_batch batch = llama_batch_init(
+            std::min(pp, params.n_ubatch), // max n_tokens is limited by n_ubatch
+            0, // No embeddings
+            1 // Single sequence
+        );
 
        for (int i = 0; i < nr; i++)
        {
@@ -734,7 +716,27 @@ struct llama_rn_context
            std::string("]");
    }
 
-
+    int applyLoraAdapters(std::vector<common_lora_adapter_info> lora) {
+        for (auto &la : lora) {
+            la.ptr = llama_lora_adapter_init(model, la.path.c_str());
+            if (la.ptr == nullptr) {
+                LOG_ERROR("failed to apply lora adapter '%s'\n", la.path.c_str());
+                return -1;
+            }
+        }
+        this->lora = lora;
+        common_lora_adapters_apply(ctx, lora);
+        return 0;
+    }
+
+    void removeLoraAdapters() {
+        this->lora.clear();
+        common_lora_adapters_apply(ctx, this->lora); // apply empty list
+    }
+
+    std::vector<common_lora_adapter_info> getLoadedLoraAdapters() {
+        return this->lora;
+    }
    // Context Shifting from KoboldCpp <https://github.com/LostRuins/koboldcpp>
    // Implementation obtained with special permission from @concedo
 
@@ -897,6 +899,7 @@ void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context
 }
 
 // End Context Shifting
+
 };
 
 }