cui-llama.rn 1.3.5 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/android/src/main/CMakeLists.txt +14 -8
  2. package/android/src/main/jni.cpp +38 -37
  3. package/cpp/common.cpp +43 -26
  4. package/cpp/common.h +18 -11
  5. package/cpp/ggml-backend-reg.cpp +5 -0
  6. package/cpp/ggml-backend.cpp +5 -2
  7. package/cpp/ggml-cpp.h +1 -0
  8. package/cpp/ggml-cpu-aarch64.cpp +6 -1
  9. package/cpp/ggml-cpu-quants.c +5 -1
  10. package/cpp/ggml-impl.h +11 -16
  11. package/cpp/ggml-metal.m +2 -2
  12. package/cpp/ggml.c +0 -1276
  13. package/cpp/ggml.h +0 -140
  14. package/cpp/gguf.cpp +1325 -0
  15. package/cpp/gguf.h +202 -0
  16. package/cpp/llama-adapter.cpp +346 -0
  17. package/cpp/llama-adapter.h +73 -0
  18. package/cpp/llama-arch.cpp +1434 -0
  19. package/cpp/llama-arch.h +395 -0
  20. package/cpp/llama-batch.cpp +368 -0
  21. package/cpp/llama-batch.h +88 -0
  22. package/cpp/llama-chat.cpp +567 -0
  23. package/cpp/llama-chat.h +51 -0
  24. package/cpp/llama-context.cpp +1771 -0
  25. package/cpp/llama-context.h +128 -0
  26. package/cpp/llama-cparams.cpp +1 -0
  27. package/cpp/llama-cparams.h +37 -0
  28. package/cpp/llama-cpp.h +30 -0
  29. package/cpp/llama-grammar.cpp +1 -0
  30. package/cpp/llama-grammar.h +3 -1
  31. package/cpp/llama-hparams.cpp +71 -0
  32. package/cpp/llama-hparams.h +140 -0
  33. package/cpp/llama-impl.cpp +167 -0
  34. package/cpp/llama-impl.h +16 -136
  35. package/cpp/llama-kv-cache.cpp +718 -0
  36. package/cpp/llama-kv-cache.h +218 -0
  37. package/cpp/llama-mmap.cpp +589 -0
  38. package/cpp/llama-mmap.h +67 -0
  39. package/cpp/llama-model-loader.cpp +1011 -0
  40. package/cpp/llama-model-loader.h +158 -0
  41. package/cpp/llama-model.cpp +2202 -0
  42. package/cpp/llama-model.h +391 -0
  43. package/cpp/llama-sampling.cpp +117 -4
  44. package/cpp/llama-vocab.cpp +21 -28
  45. package/cpp/llama-vocab.h +13 -1
  46. package/cpp/llama.cpp +8437 -19421
  47. package/cpp/llama.cpp.rej +23 -0
  48. package/cpp/llama.h +31 -6
  49. package/cpp/rn-llama.hpp +39 -37
  50. package/cpp/sgemm.cpp +776 -70
  51. package/cpp/unicode.cpp +6 -0
  52. package/package.json +1 -1
package/cpp/llama.cpp.rej ADDED
@@ -0,0 +1,23 @@
+ --- llama.cpp.orig 2024-11-02 12:42:13
+ +++ llama.cpp 2024-11-02 13:00:37
+ @@ -1941,16 +1952,16 @@
+
+ if (prefetch > 0) {
+ // advise the kernel to preload the mapped memory
+ - if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
+ - LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
+ + if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
+ + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+ strerror(errno));
+ }
+ }
+ if (numa) {
+ // advise the kernel not to use readahead
+ // (because the next page might not belong on the same node)
+ - if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
+ - LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
+ + if (madvise(addr, file->size, MADV_RANDOM)) {
+ + fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
+ strerror(errno));
+ }
+ }
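The reject file above records a hunk from the fork's local patch that swaps posix_madvise for madvise (and LLAMA_LOG_WARN for fprintf) in the mmap prefetch path; the hunk no longer applies cleanly, presumably because that code moved into the new llama-mmap.cpp during this upstream sync. A minimal, self-contained sketch of the pattern the patch applies (the function name and structure are illustrative, not the package's actual code):

    #include <sys/mman.h>   // madvise, MADV_WILLNEED, MADV_RANDOM
    #include <algorithm>    // std::min
    #include <cerrno>
    #include <cstddef>
    #include <cstdio>
    #include <cstring>

    // Advise the kernel about the access pattern of an mmap'd model file.
    static void advise_mapping(void * addr, size_t file_size, size_t prefetch, bool numa) {
        if (prefetch > 0) {
            // ask the kernel to preload the first `prefetch` bytes of the mapping
            if (madvise(addr, std::min(file_size, prefetch), MADV_WILLNEED)) {
                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", strerror(errno));
            }
        }
        if (numa) {
            // disable readahead: the next page might not belong to the same NUMA node
            if (madvise(addr, file_size, MADV_RANDOM)) {
                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", strerror(errno));
            }
        }
    }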
package/cpp/llama.h CHANGED
@@ -35,7 +35,6 @@

  #define LLAMA_DEFAULT_SEED 0xFFFFFFFF

- // TODO: use everywhere in the implementation
  #define LLAMA_TOKEN_NULL -1

  #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
@@ -106,6 +105,7 @@ extern "C" {
  LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
  LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
  LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
  };

  enum llama_rope_type {
@@ -386,6 +386,7 @@ extern "C" {
  } llama_chat_message;

  // lora adapter
+ // TODO: rename to llama_adapter_lora
  struct llama_lora_adapter;

  // Helpers for getting default parameters
@@ -413,11 +414,19 @@ extern "C" {
  // Call once at the end of the program - currently only used for MPI
  LLAMA_API void llama_backend_free(void);

- LLAMA_API struct llama_model * llama_load_model_from_file(
+ DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
+ const char * path_model,
+ struct llama_model_params params),
+ "use llama_model_load_from_file instead");
+
+ LLAMA_API struct llama_model * llama_model_load_from_file(
  const char * path_model,
  struct llama_model_params params);

- LLAMA_API void llama_free_model(struct llama_model * model);
+ DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
+ "use llama_model_free instead");
+
+ LLAMA_API void llama_model_free(struct llama_model * model);

  // TODO: rename to llama_init_from_model
  LLAMA_API struct llama_context * llama_new_context_with_model(
@@ -502,14 +511,19 @@ extern "C" {
  const char * fname_out,
  const llama_model_quantize_params * params);

+ //
+ // Adapters
+ //
+
  // Load a LoRA adapter from file
- // The loaded adapter will be associated to the given model, and will be free when the model is deleted
+ // TODO: rename to llama_adapter_lora_init
  LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
  struct llama_model * model,
  const char * path_lora);

  // Add a loaded LoRA adapter to given context
  // This will not modify model's weight
+ // TODO: rename to llama_set_adapter_lora
  LLAMA_API int32_t llama_lora_adapter_set(
  struct llama_context * ctx,
  struct llama_lora_adapter * adapter,
@@ -517,16 +531,18 @@ extern "C" {

  // Remove a specific LoRA adapter from given context
  // Return -1 if the adapter is not present in the context
+ // TODO: rename to llama_rm_adapter_lora
  LLAMA_API int32_t llama_lora_adapter_remove(
  struct llama_context * ctx,
  struct llama_lora_adapter * adapter);

  // Remove all LoRA adapters from given context
- LLAMA_API void llama_lora_adapter_clear(
- struct llama_context * ctx);
+ // TODO: rename to llama_clear_adapter_lora
+ LLAMA_API void llama_lora_adapter_clear(struct llama_context * ctx);

  // Manually free a LoRA adapter
  // Note: loaded adapters will be free when the associated model is deleted
+ // TODO: rename to llama_adapter_lora_free
  LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);

  // Apply a loaded control vector to a llama_context, or if data is NULL, clear
@@ -535,6 +551,7 @@ extern "C" {
  // to an n_embd x n_layers buffer starting from layer 1.
  // il_start and il_end are the layer range the vector should apply to (both inclusive)
  // See llama_control_vector_load in common to load a control vector.
+ // TODO: rename to llama_adapter_cvec_apply
  LLAMA_API int32_t llama_control_vector_apply(
  struct llama_context * lctx,
  const float * data,
@@ -547,6 +564,8 @@ extern "C" {
  // KV cache
  //

+ // TODO: remove llama_kv_cache_view_* API
+
  // Information associated with an individual cell in the KV cache view.
  struct llama_kv_cache_view_cell {
  // The position for this cell. Takes KV cache shifts into account.
@@ -593,8 +612,11 @@ extern "C" {
  LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);

  // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+ // TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
  LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);

+ ///
+
  // Returns the number of tokens in the KV cache (slow, use only for debug)
  // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
  LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
@@ -664,6 +686,9 @@ extern "C" {
  struct llama_context * ctx,
  llama_seq_id seq_id);

+ // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache
+ // how to avoid this?
+
  // Defragment the KV cache
  // This will be applied:
  // - lazily on next llama_decode()
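For downstream callers, the practical effect of the llama.h changes above is that llama_load_model_from_file and llama_free_model are now deprecated in favor of llama_model_load_from_file and llama_model_free. A minimal migration sketch (the model path is a placeholder):

    #include "llama.h"

    int main() {
        llama_model_params mparams = llama_model_default_params();

        // before (still compiles, but now triggers the DEPRECATED warning):
        //   llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        //   ...
        //   llama_free_model(model);

        // after:
        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model == nullptr) {
            return 1;
        }
        // ... create a context, run inference ...
        llama_model_free(model);  // replaces llama_free_model
        return 0;
    }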
package/cpp/rn-llama.hpp CHANGED
@@ -8,6 +8,7 @@
  #include "llama.h"
  #include "llama-impl.h"
  #include "sampling.h"
+ #include "llama-cpp.h"

  namespace rnllama {

@@ -187,7 +188,7 @@ static std::string tokens_to_output_formatted_string(const llama_context *ctx, c
  }

  template <class Iter>
- static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
+ static std::string tokens_to_str(llama_context* ctx, Iter begin, Iter end)
  {
  std::string ret;
  for (; begin != end; ++begin)
@@ -214,11 +215,11 @@ struct llama_rn_context

  common_params params;

- llama_model *model = nullptr;
+ llama_model_ptr model = nullptr;
  float loading_progress = 0;
  bool is_load_interrupted = false;

- llama_context *ctx = nullptr;
+ llama_context_ptr ctx = nullptr;
  common_sampler *ctx_sampling = nullptr;

  int n_ctx;
@@ -234,12 +235,12 @@ struct llama_rn_context
  {
  if (ctx)
  {
- llama_free(ctx);
+ llama_free(ctx.get());
  ctx = nullptr;
  }
  if (model)
  {
- llama_free_model(model);
+ llama_model_free(model.get());
  model = nullptr;
  }
  if (ctx_sampling != nullptr)
@@ -273,7 +274,7 @@ struct llama_rn_context
  if (ctx_sampling != nullptr) {
  common_sampler_free(ctx_sampling);
  }
- ctx_sampling = common_sampler_init(model, params.sampling);
+ ctx_sampling = common_sampler_init(model.get(), params.sampling);
  return ctx_sampling != nullptr;
  }

@@ -281,26 +282,26 @@ struct llama_rn_context
  {
  params = params_;
  common_init_result result = common_init_from_params(params);
- model = result.model;
- ctx = result.context;
+ model = std::move(result.model);
+ ctx = std::move(result.context);
  if (model == nullptr)
  {
  LOG_ERROR("unable to load model: %s", params_.model.c_str());
  return false;
  }
  LOG_VERBOSE("getting n_ctx");
- n_ctx = llama_n_ctx(ctx);
+ n_ctx = llama_n_ctx(ctx.get());
  return true;
  }

  bool validateModelChatTemplate() const {
  std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
  std::string template_key = "tokenizer.chat_template";
- int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+ int32_t res = llama_model_meta_val_str(model.get(), template_key.c_str(), model_template.data(), model_template.size());
  if (res >= 0) {
  llama_chat_message chat[] = {{"user", "test"}};
  std::string tmpl = std::string(model_template.data(), model_template.size());
- int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
+ int32_t chat_res = llama_chat_apply_template(model.get(), tmpl.c_str(), chat, 1, true, nullptr, 0);
  return chat_res > 0;
  }
  return res > 0;
@@ -330,7 +331,7 @@ struct llama_rn_context

  void loadPrompt()
  {
- std::vector<llama_token> prompt_tokens = ::common_tokenize(ctx, params.prompt, true, true);
+ std::vector<llama_token> prompt_tokens = ::common_tokenize(model.get(), params.prompt, true, true);
  num_prompt_tokens = prompt_tokens.size();

  // LOG tokens
@@ -358,7 +359,7 @@ struct llama_rn_context

  // do Context Shift , may be buggy! TODO: Verify functionality
  if(!params.embedding){
- purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
+ purge_missing_tokens(ctx.get(), embd, prompt_tokens, params.n_predict, params.n_ctx);
  }

  // push the prompt into the sampling context (do not apply grammar)
@@ -379,7 +380,7 @@ struct llama_rn_context
  }

  // since #3228 we now have to manually manage the KV cache
- llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+ llama_kv_cache_seq_rm(ctx.get(), 0, n_past, -1);

  LOG_VERBOSE("prompt ingested, n_past: %d, cached: %s, to_eval: %s",
  n_past,
@@ -394,7 +395,7 @@ struct llama_rn_context
  {
  // number of tokens to keep when resetting context
  n_remain = params.n_predict;
- llama_perf_context_reset(ctx);
+ llama_perf_context_reset(ctx.get());
  is_predicting = true;
  }

@@ -410,8 +411,8 @@ struct llama_rn_context
  const int n_left = n_past - params.n_keep - 1;
  const int n_discard = n_left/2;

- llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
- llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+ llama_kv_cache_seq_rm (ctx.get(), 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
+ llama_kv_cache_seq_add(ctx.get(), 0, params.n_keep + 1 + n_discard, n_past, -n_discard);

  for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++)
  {
@@ -437,13 +438,14 @@ struct llama_rn_context
  {
  n_eval = params.n_batch;
  }
- if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval)))
+ if (llama_decode(ctx.get(), llama_batch_get_one(&embd[n_past], n_eval)))
  {
+
  LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s",
  n_eval,
  n_past,
  params.cpuparams.n_threads,
- tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str()
+ tokens_to_str(ctx.get(), embd.cbegin() + n_past, embd.cend()).c_str()
  );
  has_next_token = false;
  return result;
@@ -461,16 +463,16 @@ struct llama_rn_context
  if (params.n_predict == 0)
  {
  has_next_token = false;
- result.tok = llama_token_eos(model);
+ result.tok = llama_token_eos(model.get());
  return result;
  }

  {
  // out of user input, sample next token
  std::vector<llama_token_data> candidates;
- candidates.reserve(llama_n_vocab(model));
+ candidates.reserve(llama_n_vocab(model.get()));

- result.tok = common_sampler_sample(ctx_sampling, ctx, -1);
+ result.tok = common_sampler_sample(ctx_sampling, ctx.get(), -1);

  llama_token_data_array cur_p = *common_sampler_get_candidates(ctx_sampling);

@@ -501,7 +503,7 @@ struct llama_rn_context
  // decrement remaining sampling budget
  --n_remain;

- if (!embd.empty() && embd.back() == llama_token_eos(model))
+ if (!embd.empty() && embd.back() == llama_token_eos(model.get()))
  {
  // stopping_word = llama_token_to_piece(ctx, embd.back());
  has_next_token = false;
@@ -550,7 +552,7 @@ struct llama_rn_context
  {
  const completion_token_output token_with_probs = nextToken();

- const std::string token_text = token_with_probs.tok == -1 ? "" : common_token_to_piece(ctx, token_with_probs.tok);
+ const std::string token_text = token_with_probs.tok == -1 ? "" : common_token_to_piece(ctx.get(), token_with_probs.tok);
  generated_text += token_text;

  if (params.sampling.n_probs > 0)
@@ -606,7 +608,7 @@ struct llama_rn_context

  std::vector<float> getEmbedding(common_params &embd_params)
  {
- static const int n_embd = llama_n_embd(llama_get_model(ctx));
+ static const int n_embd = llama_n_embd(llama_get_model(ctx.get()));
  if (!embd_params.embedding)
  {
  LOG_WARNING("embedding disabled, embedding: %s", embd_params.embedding);
@@ -614,12 +616,12 @@ struct llama_rn_context
  }
  float *data;

- const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+ const enum llama_pooling_type pooling_type = llama_pooling_type(ctx.get());
  printf("pooling_type: %d\n", pooling_type);
  if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
- data = llama_get_embeddings(ctx);
+ data = llama_get_embeddings(ctx.get());
  } else {
- data = llama_get_embeddings_seq(ctx, 0);
+ data = llama_get_embeddings_seq(ctx.get(), 0);
  }

  if (!data) {
@@ -661,15 +663,15 @@ struct llama_rn_context
  }
  batch.logits[batch.n_tokens - 1] = 1; // true

- llama_kv_cache_clear(ctx);
+ llama_kv_cache_clear(ctx.get());

  const int64_t t_pp_start = llama_time_us();
- if (llama_decode(ctx, batch) != 0)
+ if (llama_decode(ctx.get(), batch) != 0)
  {
  LOG_ERROR("llama_decode() failed during prompt", "");
  }
  const int64_t t_pp_end = llama_time_us();
- llama_kv_cache_clear(ctx);
+ llama_kv_cache_clear(ctx.get());

  if (is_interrupted) break;

@@ -684,7 +686,7 @@ struct llama_rn_context
  llama_batch_add(&batch, 0, i, {j}, true);
  }

- if (llama_decode(ctx, batch) != 0)
+ if (llama_decode(ctx.get(), batch) != 0)
  {
  LOG_ERROR("llama_decode() failed during text generation", "");
  }
@@ -693,7 +695,7 @@ struct llama_rn_context

  const int64_t t_tg_end = llama_time_us();

- llama_kv_cache_clear(ctx);
+ llama_kv_cache_clear(ctx.get());

  const double t_pp = (t_pp_end - t_pp_start) / 1000000.0;
  const double t_tg = (t_tg_end - t_tg_start) / 1000000.0;
@@ -719,14 +721,14 @@ struct llama_rn_context
  tg_std = 0;
  }

- if (is_interrupted) llama_kv_cache_clear(ctx);
+ if (is_interrupted) llama_kv_cache_clear(ctx.get());
  is_predicting = false;

  char model_desc[128];
- llama_model_desc(model, model_desc, sizeof(model_desc));
+ llama_model_desc(model.get(), model_desc, sizeof(model_desc));
  return std::string("[\"") + model_desc + std::string("\",") +
- std::to_string(llama_model_size(model)) + std::string(",") +
- std::to_string(llama_model_n_params(model)) + std::string(",") +
+ std::to_string(llama_model_size(model.get())) + std::string(",") +
+ std::to_string(llama_model_n_params(model.get())) + std::string(",") +
  std::to_string(pp_avg) + std::string(",") +
  std::to_string(pp_std) + std::string(",") +
  std::to_string(tg_avg) + std::string(",") +
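The rn-llama.hpp changes above replace the raw llama_model */llama_context * members with the llama_model_ptr/llama_context_ptr owners pulled in through the new llama-cpp.h include, passing raw handles to the C API via .get(). A minimal sketch of that ownership pattern, assuming llama-cpp.h provides these types as std::unique_ptr aliases whose deleters free the model and context (the struct below is illustrative, not the package's actual code):

    #include "llama.h"
    #include "llama-cpp.h"   // llama_model_ptr / llama_context_ptr (assumed unique_ptr aliases)

    struct minimal_runner {
        llama_model_ptr   model;  // owns the model; freed automatically on reset/destruction
        llama_context_ptr ctx;    // owns the context

        bool load(const char * path) {
            model.reset(llama_model_load_from_file(path, llama_model_default_params()));
            if (!model) {
                return false;
            }
            ctx.reset(llama_new_context_with_model(model.get(), llama_context_default_params()));
            return ctx != nullptr;
        }

        uint32_t n_ctx() const {
            // raw handles are handed to the C API via .get(), as in the diff above
            return ctx ? llama_n_ctx(ctx.get()) : 0;
        }
    };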