cui-llama.rn 1.3.5 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +14 -8
- package/android/src/main/jni.cpp +38 -37
- package/cpp/common.cpp +43 -26
- package/cpp/common.h +18 -11
- package/cpp/ggml-backend-reg.cpp +5 -0
- package/cpp/ggml-backend.cpp +5 -2
- package/cpp/ggml-cpp.h +1 -0
- package/cpp/ggml-cpu-aarch64.cpp +6 -1
- package/cpp/ggml-cpu-quants.c +5 -1
- package/cpp/ggml-impl.h +11 -16
- package/cpp/ggml-metal.m +2 -2
- package/cpp/ggml.c +0 -1276
- package/cpp/ggml.h +0 -140
- package/cpp/gguf.cpp +1325 -0
- package/cpp/gguf.h +202 -0
- package/cpp/llama-adapter.cpp +346 -0
- package/cpp/llama-adapter.h +73 -0
- package/cpp/llama-arch.cpp +1434 -0
- package/cpp/llama-arch.h +395 -0
- package/cpp/llama-batch.cpp +368 -0
- package/cpp/llama-batch.h +88 -0
- package/cpp/llama-chat.cpp +567 -0
- package/cpp/llama-chat.h +51 -0
- package/cpp/llama-context.cpp +1771 -0
- package/cpp/llama-context.h +128 -0
- package/cpp/llama-cparams.cpp +1 -0
- package/cpp/llama-cparams.h +37 -0
- package/cpp/llama-cpp.h +30 -0
- package/cpp/llama-grammar.cpp +1 -0
- package/cpp/llama-grammar.h +3 -1
- package/cpp/llama-hparams.cpp +71 -0
- package/cpp/llama-hparams.h +140 -0
- package/cpp/llama-impl.cpp +167 -0
- package/cpp/llama-impl.h +16 -136
- package/cpp/llama-kv-cache.cpp +718 -0
- package/cpp/llama-kv-cache.h +218 -0
- package/cpp/llama-mmap.cpp +589 -0
- package/cpp/llama-mmap.h +67 -0
- package/cpp/llama-model-loader.cpp +1011 -0
- package/cpp/llama-model-loader.h +158 -0
- package/cpp/llama-model.cpp +2202 -0
- package/cpp/llama-model.h +391 -0
- package/cpp/llama-sampling.cpp +117 -4
- package/cpp/llama-vocab.cpp +21 -28
- package/cpp/llama-vocab.h +13 -1
- package/cpp/llama.cpp +8437 -19421
- package/cpp/llama.cpp.rej +23 -0
- package/cpp/llama.h +31 -6
- package/cpp/rn-llama.hpp +39 -37
- package/cpp/sgemm.cpp +776 -70
- package/cpp/unicode.cpp +6 -0
- package/package.json +1 -1
package/cpp/llama.cpp.rej
ADDED
@@ -0,0 +1,23 @@
+--- llama.cpp.orig 2024-11-02 12:42:13
++++ llama.cpp 2024-11-02 13:00:37
+@@ -1941,16 +1952,16 @@
+ 
+         if (prefetch > 0) {
+             // advise the kernel to preload the mapped memory
+-            if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
+-                LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
++            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
++                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+                         strerror(errno));
+             }
+         }
+         if (numa) {
+             // advise the kernel not to use readahead
+             // (because the next page might not belong on the same node)
+-            if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
+-                LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
++            if (madvise(addr, file->size, MADV_RANDOM)) {
++                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
+                         strerror(errno));
+             }
+         }
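The rejected hunk above is the package's local patch that swaps posix_madvise()/LLAMA_LOG_WARN for madvise()/fprintf in the mmap prefetch path; it presumably no longer applies because that code moved into the new llama-mmap.cpp in this release. A minimal sketch of what the patched logic does, assuming a POSIX mmap()'d file; the helper name advise_mapping is illustrative, not part of the library:

#include <sys/mman.h>
#include <algorithm>
#include <cerrno>
#include <cstdio>
#include <cstring>

// Give the kernel the same advice as the patched code: preload the first
// `prefetch` bytes, and disable readahead on NUMA systems.
static void advise_mapping(void * addr, size_t file_size, size_t prefetch, bool numa) {
    if (prefetch > 0) {
        // ask the kernel to preload the mapped memory
        if (madvise(addr, std::min(file_size, prefetch), MADV_WILLNEED)) {
            fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", strerror(errno));
        }
    }
    if (numa) {
        // advise the kernel not to use readahead
        // (the next page might not belong to the same NUMA node)
        if (madvise(addr, file_size, MADV_RANDOM)) {
            fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", strerror(errno));
        }
    }
}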
package/cpp/llama.h
CHANGED
@@ -35,7 +35,6 @@
 
 #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
-// TODO: use everywhere in the implementation
 #define LLAMA_TOKEN_NULL -1
 
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
@@ -106,6 +105,7 @@ extern "C" {
        LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
        LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
        LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
+       LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
     };
 
     enum llama_rope_type {
@@ -386,6 +386,7 @@ extern "C" {
     } llama_chat_message;
 
     // lora adapter
+    // TODO: rename to llama_adapter_lora
     struct llama_lora_adapter;
 
     // Helpers for getting default parameters
@@ -413,11 +414,19 @@ extern "C" {
     // Call once at the end of the program - currently only used for MPI
     LLAMA_API void llama_backend_free(void);
 
-    LLAMA_API struct llama_model * llama_load_model_from_file(
+    DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
+                             const char * path_model,
+              struct llama_model_params   params),
+            "use llama_model_load_from_file instead");
+
+    LLAMA_API struct llama_model * llama_model_load_from_file(
                              const char * path_model,
              struct llama_model_params   params);
 
-    LLAMA_API void llama_free_model(struct llama_model * model);
+    DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
+            "use llama_model_free instead");
+
+    LLAMA_API void llama_model_free(struct llama_model * model);
 
     // TODO: rename to llama_init_from_model
     LLAMA_API struct llama_context * llama_new_context_with_model(
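The hunk above deprecates llama_load_model_from_file/llama_free_model in favor of the new llama_model_load_from_file/llama_model_free. A hedged migration sketch using only the declarations shown above plus llama_model_default_params() from the same header; the wrapper function names are illustrative, not library API:

#include "llama.h"

// load a model through the non-deprecated entry point introduced in this release
static struct llama_model * load_model(const char * path_model) {
    struct llama_model_params mparams = llama_model_default_params();
    // was: return llama_load_model_from_file(path_model, mparams);
    return llama_model_load_from_file(path_model, mparams);
}

// release it through the matching new free function
static void unload_model(struct llama_model * model) {
    // was: llama_free_model(model);
    llama_model_free(model);
}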
@@ -502,14 +511,19 @@ extern "C" {
             const char * fname_out,
             const llama_model_quantize_params * params);
 
+    //
+    // Adapters
+    //
+
     // Load a LoRA adapter from file
-    //
+    // TODO: rename to llama_adapter_lora_init
     LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
             struct llama_model * model,
             const char * path_lora);
 
     // Add a loaded LoRA adapter to given context
     // This will not modify model's weight
+    // TODO: rename to llama_set_adapter_lora
     LLAMA_API int32_t llama_lora_adapter_set(
             struct llama_context * ctx,
             struct llama_lora_adapter * adapter,
@@ -517,16 +531,18 @@ extern "C" {
 
     // Remove a specific LoRA adapter from given context
     // Return -1 if the adapter is not present in the context
+    // TODO: rename to llama_rm_adapter_lora
     LLAMA_API int32_t llama_lora_adapter_remove(
             struct llama_context * ctx,
             struct llama_lora_adapter * adapter);
 
     // Remove all LoRA adapters from given context
-    LLAMA_API void llama_lora_adapter_clear(
-            struct llama_context * ctx);
+    // TODO: rename to llama_clear_adapter_lora
+    LLAMA_API void llama_lora_adapter_clear(struct llama_context * ctx);
 
     // Manually free a LoRA adapter
     // Note: loaded adapters will be free when the associated model is deleted
+    // TODO: rename to llama_adapter_lora_free
     LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
 
     // Apply a loaded control vector to a llama_context, or if data is NULL, clear
@@ -535,6 +551,7 @@ extern "C" {
     // to an n_embd x n_layers buffer starting from layer 1.
     // il_start and il_end are the layer range the vector should apply to (both inclusive)
     // See llama_control_vector_load in common to load a control vector.
+    // TODO: rename to llama_adapter_cvec_apply
     LLAMA_API int32_t llama_control_vector_apply(
             struct llama_context * lctx,
                      const float * data,
|
|
547
564
|
// KV cache
|
548
565
|
//
|
549
566
|
|
567
|
+
// TODO: remove llama_kv_cache_view_* API
|
568
|
+
|
550
569
|
// Information associated with an individual cell in the KV cache view.
|
551
570
|
struct llama_kv_cache_view_cell {
|
552
571
|
// The position for this cell. Takes KV cache shifts into account.
|
@@ -593,8 +612,11 @@ extern "C" {
     LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
 
     // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+    // TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
     LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
 
+    ///
+
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
     LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
@@ -664,6 +686,9 @@ extern "C" {
             struct llama_context * ctx,
                     llama_seq_id   seq_id);
 
+    // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache
+    //       how to avoid this?
+
     // Defragment the KV cache
     // This will be applied:
     //   - lazily on next llama_decode()
package/cpp/rn-llama.hpp
CHANGED
@@ -8,6 +8,7 @@
 #include "llama.h"
 #include "llama-impl.h"
 #include "sampling.h"
+#include "llama-cpp.h"
 
 namespace rnllama {
 
@@ -187,7 +188,7 @@ static std::string tokens_to_output_formatted_string(const llama_context *ctx, c
 }
 
 template <class Iter>
-static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
+static std::string tokens_to_str(llama_context* ctx, Iter begin, Iter end)
 {
     std::string ret;
     for (; begin != end; ++begin)
@@ -214,11 +215,11 @@ struct llama_rn_context
 
     common_params params;
 
-    llama_model *model = nullptr;
+    llama_model_ptr model = nullptr;
     float loading_progress = 0;
     bool is_load_interrupted = false;
 
-    llama_context *ctx = nullptr;
+    llama_context_ptr ctx = nullptr;
     common_sampler *ctx_sampling = nullptr;
 
     int n_ctx;
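This hunk replaces the raw llama_model*/llama_context* members with llama_model_ptr/llama_context_ptr from the newly included llama-cpp.h, and the hunks that follow switch every call site to .get(). A minimal sketch of that ownership pattern, assuming (as in upstream llama.cpp) the two aliases are std::unique_ptr wrappers whose deleters call llama_model_free()/llama_free(); the holder struct itself is illustrative, not part of rn-llama.hpp:

#include "llama.h"
#include "llama-cpp.h"

// illustrative holder: the smart-pointer members own the model and context,
// and every C API call goes through .get()
struct llama_holder {
    llama_model_ptr   model;
    llama_context_ptr ctx;

    bool load(const char * path_model) {
        model.reset(llama_model_load_from_file(path_model, llama_model_default_params()));
        if (!model) {
            return false;
        }
        ctx.reset(llama_new_context_with_model(model.get(), llama_context_default_params()));
        return ctx != nullptr;
    }

    uint32_t n_ctx() const {
        return ctx ? llama_n_ctx(ctx.get()) : 0;
    }

    // no explicit llama_free()/llama_model_free(): ctx is destroyed before model
    // (reverse declaration order) when the holder goes out of scope
};

Because these wrappers are move-only, the assignments from common_init_from_params() later in this file use std::move instead of plain copies.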
@@ -234,12 +235,12 @@ struct llama_rn_context
     {
         if (ctx)
         {
-            llama_free(ctx);
+            llama_free(ctx.get());
             ctx = nullptr;
         }
         if (model)
         {
-            llama_free_model(model);
+            llama_model_free(model.get());
             model = nullptr;
         }
         if (ctx_sampling != nullptr)
@@ -273,7 +274,7 @@ struct llama_rn_context
         if (ctx_sampling != nullptr) {
             common_sampler_free(ctx_sampling);
         }
-        ctx_sampling = common_sampler_init(model, params.sampling);
+        ctx_sampling = common_sampler_init(model.get(), params.sampling);
         return ctx_sampling != nullptr;
     }
 
@@ -281,26 +282,26 @@ struct llama_rn_context
     {
         params = params_;
         common_init_result result = common_init_from_params(params);
-        model = result.model;
-        ctx = result.context;
+        model = std::move(result.model);
+        ctx = std::move(result.context);
         if (model == nullptr)
         {
             LOG_ERROR("unable to load model: %s", params_.model.c_str());
             return false;
         }
         LOG_VERBOSE("getting n_ctx");
-        n_ctx = llama_n_ctx(ctx);
+        n_ctx = llama_n_ctx(ctx.get());
         return true;
     }
 
     bool validateModelChatTemplate() const {
         std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
         std::string template_key = "tokenizer.chat_template";
-        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+        int32_t res = llama_model_meta_val_str(model.get(), template_key.c_str(), model_template.data(), model_template.size());
         if (res >= 0) {
             llama_chat_message chat[] = {{"user", "test"}};
             std::string tmpl = std::string(model_template.data(), model_template.size());
-            int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
+            int32_t chat_res = llama_chat_apply_template(model.get(), tmpl.c_str(), chat, 1, true, nullptr, 0);
             return chat_res > 0;
         }
         return res > 0;
|
|
330
331
|
|
331
332
|
void loadPrompt()
|
332
333
|
{
|
333
|
-
std::vector<llama_token> prompt_tokens = ::common_tokenize(
|
334
|
+
std::vector<llama_token> prompt_tokens = ::common_tokenize(model.get(), params.prompt, true, true);
|
334
335
|
num_prompt_tokens = prompt_tokens.size();
|
335
336
|
|
336
337
|
// LOG tokens
|
@@ -358,7 +359,7 @@ struct llama_rn_context
|
|
358
359
|
|
359
360
|
// do Context Shift , may be buggy! TODO: Verify functionality
|
360
361
|
if(!params.embedding){
|
361
|
-
purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
|
362
|
+
purge_missing_tokens(ctx.get(), embd, prompt_tokens, params.n_predict, params.n_ctx);
|
362
363
|
}
|
363
364
|
|
364
365
|
// push the prompt into the sampling context (do not apply grammar)
|
@@ -379,7 +380,7 @@ struct llama_rn_context
|
|
379
380
|
}
|
380
381
|
|
381
382
|
// since #3228 we now have to manually manage the KV cache
|
382
|
-
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
383
|
+
llama_kv_cache_seq_rm(ctx.get(), 0, n_past, -1);
|
383
384
|
|
384
385
|
LOG_VERBOSE("prompt ingested, n_past: %d, cached: %s, to_eval: %s",
|
385
386
|
n_past,
|
@@ -394,7 +395,7 @@ struct llama_rn_context
|
|
394
395
|
{
|
395
396
|
// number of tokens to keep when resetting context
|
396
397
|
n_remain = params.n_predict;
|
397
|
-
llama_perf_context_reset(ctx);
|
398
|
+
llama_perf_context_reset(ctx.get());
|
398
399
|
is_predicting = true;
|
399
400
|
}
|
400
401
|
|
@@ -410,8 +411,8 @@ struct llama_rn_context
|
|
410
411
|
const int n_left = n_past - params.n_keep - 1;
|
411
412
|
const int n_discard = n_left/2;
|
412
413
|
|
413
|
-
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
414
|
-
llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
414
|
+
llama_kv_cache_seq_rm (ctx.get(), 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
415
|
+
llama_kv_cache_seq_add(ctx.get(), 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
415
416
|
|
416
417
|
for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++)
|
417
418
|
{
|
@@ -437,13 +438,14 @@ struct llama_rn_context
|
|
437
438
|
{
|
438
439
|
n_eval = params.n_batch;
|
439
440
|
}
|
440
|
-
if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval)))
|
441
|
+
if (llama_decode(ctx.get(), llama_batch_get_one(&embd[n_past], n_eval)))
|
441
442
|
{
|
443
|
+
|
442
444
|
LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s",
|
443
445
|
n_eval,
|
444
446
|
n_past,
|
445
447
|
params.cpuparams.n_threads,
|
446
|
-
tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str()
|
448
|
+
tokens_to_str(ctx.get(), embd.cbegin() + n_past, embd.cend()).c_str()
|
447
449
|
);
|
448
450
|
has_next_token = false;
|
449
451
|
return result;
|
@@ -461,16 +463,16 @@ struct llama_rn_context
         if (params.n_predict == 0)
         {
             has_next_token = false;
-            result.tok = llama_token_eos(model);
+            result.tok = llama_token_eos(model.get());
             return result;
         }
 
         {
             // out of user input, sample next token
             std::vector<llama_token_data> candidates;
-            candidates.reserve(llama_n_vocab(model));
+            candidates.reserve(llama_n_vocab(model.get()));
 
-            result.tok = common_sampler_sample(ctx_sampling, ctx, -1);
+            result.tok = common_sampler_sample(ctx_sampling, ctx.get(), -1);
 
             llama_token_data_array cur_p = *common_sampler_get_candidates(ctx_sampling);
 
@@ -501,7 +503,7 @@ struct llama_rn_context
         // decrement remaining sampling budget
         --n_remain;
 
-        if (!embd.empty() && embd.back() == llama_token_eos(model))
+        if (!embd.empty() && embd.back() == llama_token_eos(model.get()))
         {
             // stopping_word = llama_token_to_piece(ctx, embd.back());
             has_next_token = false;
|
|
550
552
|
{
|
551
553
|
const completion_token_output token_with_probs = nextToken();
|
552
554
|
|
553
|
-
const std::string token_text = token_with_probs.tok == -1 ? "" : common_token_to_piece(ctx, token_with_probs.tok);
|
555
|
+
const std::string token_text = token_with_probs.tok == -1 ? "" : common_token_to_piece(ctx.get(), token_with_probs.tok);
|
554
556
|
generated_text += token_text;
|
555
557
|
|
556
558
|
if (params.sampling.n_probs > 0)
|
@@ -606,7 +608,7 @@ struct llama_rn_context
|
|
606
608
|
|
607
609
|
std::vector<float> getEmbedding(common_params &embd_params)
|
608
610
|
{
|
609
|
-
static const int n_embd = llama_n_embd(llama_get_model(ctx));
|
611
|
+
static const int n_embd = llama_n_embd(llama_get_model(ctx.get()));
|
610
612
|
if (!embd_params.embedding)
|
611
613
|
{
|
612
614
|
LOG_WARNING("embedding disabled, embedding: %s", embd_params.embedding);
|
@@ -614,12 +616,12 @@ struct llama_rn_context
|
|
614
616
|
}
|
615
617
|
float *data;
|
616
618
|
|
617
|
-
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
619
|
+
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx.get());
|
618
620
|
printf("pooling_type: %d\n", pooling_type);
|
619
621
|
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
620
|
-
data = llama_get_embeddings(ctx);
|
622
|
+
data = llama_get_embeddings(ctx.get());
|
621
623
|
} else {
|
622
|
-
data = llama_get_embeddings_seq(ctx, 0);
|
624
|
+
data = llama_get_embeddings_seq(ctx.get(), 0);
|
623
625
|
}
|
624
626
|
|
625
627
|
if (!data) {
|
@@ -661,15 +663,15 @@ struct llama_rn_context
|
|
661
663
|
}
|
662
664
|
batch.logits[batch.n_tokens - 1] = 1; // true
|
663
665
|
|
664
|
-
llama_kv_cache_clear(ctx);
|
666
|
+
llama_kv_cache_clear(ctx.get());
|
665
667
|
|
666
668
|
const int64_t t_pp_start = llama_time_us();
|
667
|
-
if (llama_decode(ctx, batch) != 0)
|
669
|
+
if (llama_decode(ctx.get(), batch) != 0)
|
668
670
|
{
|
669
671
|
LOG_ERROR("llama_decode() failed during prompt", "");
|
670
672
|
}
|
671
673
|
const int64_t t_pp_end = llama_time_us();
|
672
|
-
llama_kv_cache_clear(ctx);
|
674
|
+
llama_kv_cache_clear(ctx.get());
|
673
675
|
|
674
676
|
if (is_interrupted) break;
|
675
677
|
|
@@ -684,7 +686,7 @@ struct llama_rn_context
|
|
684
686
|
llama_batch_add(&batch, 0, i, {j}, true);
|
685
687
|
}
|
686
688
|
|
687
|
-
if (llama_decode(ctx, batch) != 0)
|
689
|
+
if (llama_decode(ctx.get(), batch) != 0)
|
688
690
|
{
|
689
691
|
LOG_ERROR("llama_decode() failed during text generation", "");
|
690
692
|
}
|
@@ -693,7 +695,7 @@ struct llama_rn_context
|
|
693
695
|
|
694
696
|
const int64_t t_tg_end = llama_time_us();
|
695
697
|
|
696
|
-
llama_kv_cache_clear(ctx);
|
698
|
+
llama_kv_cache_clear(ctx.get());
|
697
699
|
|
698
700
|
const double t_pp = (t_pp_end - t_pp_start) / 1000000.0;
|
699
701
|
const double t_tg = (t_tg_end - t_tg_start) / 1000000.0;
|
@@ -719,14 +721,14 @@ struct llama_rn_context
            tg_std = 0;
        }
 
-       if (is_interrupted) llama_kv_cache_clear(ctx);
+       if (is_interrupted) llama_kv_cache_clear(ctx.get());
        is_predicting = false;
 
        char model_desc[128];
-       llama_model_desc(model, model_desc, sizeof(model_desc));
+       llama_model_desc(model.get(), model_desc, sizeof(model_desc));
        return std::string("[\"") + model_desc + std::string("\",") +
-           std::to_string(llama_model_size(model)) + std::string(",") +
-           std::to_string(llama_model_n_params(model)) + std::string(",") +
+           std::to_string(llama_model_size(model.get())) + std::string(",") +
+           std::to_string(llama_model_n_params(model.get())) + std::string(",") +
            std::to_string(pp_avg) + std::string(",") +
            std::to_string(pp_std) + std::string(",") +
            std::to_string(tg_avg) + std::string(",") +