cui-llama.rn 1.3.4 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. package/android/src/main/CMakeLists.txt +14 -8
  2. package/android/src/main/jni.cpp +38 -37
  3. package/cpp/common.cpp +50 -30
  4. package/cpp/common.h +32 -13
  5. package/cpp/ggml-alloc.c +0 -1
  6. package/cpp/ggml-backend-reg.cpp +79 -49
  7. package/cpp/ggml-backend.cpp +5 -2
  8. package/cpp/ggml-cpp.h +1 -0
  9. package/cpp/ggml-cpu-aarch64.cpp +57 -72
  10. package/cpp/ggml-cpu-quants.c +5 -1
  11. package/cpp/ggml-cpu.c +6 -6
  12. package/cpp/ggml-cpu.cpp +9 -0
  13. package/cpp/ggml-impl.h +11 -0
  14. package/cpp/ggml-metal.m +2 -2
  15. package/cpp/ggml.c +129 -1388
  16. package/cpp/ggml.h +29 -152
  17. package/cpp/gguf.cpp +1325 -0
  18. package/cpp/gguf.h +202 -0
  19. package/cpp/llama-adapter.cpp +346 -0
  20. package/cpp/llama-adapter.h +73 -0
  21. package/cpp/llama-arch.cpp +1434 -0
  22. package/cpp/llama-arch.h +395 -0
  23. package/cpp/llama-batch.cpp +368 -0
  24. package/cpp/llama-batch.h +88 -0
  25. package/cpp/llama-chat.cpp +567 -0
  26. package/cpp/llama-chat.h +51 -0
  27. package/cpp/llama-context.cpp +1771 -0
  28. package/cpp/llama-context.h +128 -0
  29. package/cpp/llama-cparams.cpp +1 -0
  30. package/cpp/llama-cparams.h +37 -0
  31. package/cpp/llama-cpp.h +30 -0
  32. package/cpp/llama-grammar.cpp +16 -15
  33. package/cpp/llama-grammar.h +5 -6
  34. package/cpp/llama-hparams.cpp +71 -0
  35. package/cpp/llama-hparams.h +140 -0
  36. package/cpp/llama-impl.cpp +167 -0
  37. package/cpp/llama-impl.h +16 -136
  38. package/cpp/llama-kv-cache.cpp +718 -0
  39. package/cpp/llama-kv-cache.h +218 -0
  40. package/cpp/llama-mmap.cpp +589 -0
  41. package/cpp/llama-mmap.h +67 -0
  42. package/cpp/llama-model-loader.cpp +1011 -0
  43. package/cpp/llama-model-loader.h +158 -0
  44. package/cpp/llama-model.cpp +2202 -0
  45. package/cpp/llama-model.h +391 -0
  46. package/cpp/llama-sampling.cpp +117 -4
  47. package/cpp/llama-vocab.cpp +26 -29
  48. package/cpp/llama-vocab.h +14 -2
  49. package/cpp/llama.cpp +8839 -19131
  50. package/cpp/llama.cpp.rej +23 -0
  51. package/cpp/llama.h +31 -9
  52. package/cpp/rn-llama.hpp +39 -37
  53. package/cpp/sgemm.cpp +1091 -378
  54. package/cpp/sgemm.h +2 -2
  55. package/cpp/unicode.cpp +6 -0
  56. package/package.json +1 -1
package/android/src/main/CMakeLists.txt CHANGED
@@ -9,17 +9,23 @@ include_directories(${RNLLAMA_LIB_DIR})

  set(
  SOURCE_FILES
- ${RNLLAMA_LIB_DIR}/llama-grammar.cpp
- ${RNLLAMA_LIB_DIR}/llama-sampling.cpp
- ${RNLLAMA_LIB_DIR}/llama-vocab.cpp
- ${RNLLAMA_LIB_DIR}/log.cpp
-
- #${RNLLAMA_LIB_DIR}/amx/amx.cpp
- #${RNLLAMA_LIB_DIR}/amx/mmq.cpp

+ ${RNLLAMA_LIB_DIR}/common.cpp
  ${RNLLAMA_LIB_DIR}/llama-grammar.cpp
  ${RNLLAMA_LIB_DIR}/llama-sampling.cpp
  ${RNLLAMA_LIB_DIR}/llama-vocab.cpp
+ ${RNLLAMA_LIB_DIR}/llama-chat.cpp
+ ${RNLLAMA_LIB_DIR}/llama-mmap.cpp
+ ${RNLLAMA_LIB_DIR}/llama-context.cpp
+ ${RNLLAMA_LIB_DIR}/llama-kv-cache.cpp
+ ${RNLLAMA_LIB_DIR}/llama-model-loader.cpp
+ ${RNLLAMA_LIB_DIR}/llama-model.cpp
+ ${RNLLAMA_LIB_DIR}/llama-batch.cpp
+ ${RNLLAMA_LIB_DIR}/llama-arch.cpp
+ ${RNLLAMA_LIB_DIR}/llama-cparams.cpp
+ ${RNLLAMA_LIB_DIR}/llama-hparams.cpp
+ ${RNLLAMA_LIB_DIR}/llama-adapter.cpp
+ ${RNLLAMA_LIB_DIR}/llama-impl.cpp
  ${RNLLAMA_LIB_DIR}/log.cpp
  ${RNLLAMA_LIB_DIR}/json.hpp
  ${RNLLAMA_LIB_DIR}/json-schema-to-grammar.cpp
@@ -28,6 +34,7 @@ set(
  ${RNLLAMA_LIB_DIR}/ggml-backend.cpp
  ${RNLLAMA_LIB_DIR}/ggml-backend-reg.cpp
  ${RNLLAMA_LIB_DIR}/ggml.c
+ ${RNLLAMA_LIB_DIR}/gguf.cpp
  ${RNLLAMA_LIB_DIR}/ggml-cpu.c
  ${RNLLAMA_LIB_DIR}/ggml-cpu.cpp
  ${RNLLAMA_LIB_DIR}/ggml-cpu-aarch64.cpp
@@ -35,7 +42,6 @@ set(
  ${RNLLAMA_LIB_DIR}/ggml-cpu-quants.c
  ${RNLLAMA_LIB_DIR}/ggml-threading.cpp
  ${RNLLAMA_LIB_DIR}/ggml-quants.c
- ${RNLLAMA_LIB_DIR}/common.cpp
  ${RNLLAMA_LIB_DIR}/sampling.cpp
  ${RNLLAMA_LIB_DIR}/unicode-data.cpp
  ${RNLLAMA_LIB_DIR}/unicode.cpp
package/android/src/main/jni.cpp CHANGED
@@ -11,7 +11,8 @@
  #include <unordered_map>
  #include "llama.h"
  #include "llama-impl.h"
- #include "ggml.h"
+ #include "llama-context.h"
+ #include "gguf.h"
  #include "rn-llama.hpp"

  #define UNUSED(x) (void)(x)
@@ -336,17 +337,17 @@ Java_com_rnllama_LlamaContext_initContext(

  LOGI("[RNLlama] is_model_loaded %s", (is_model_loaded ? "true" : "false"));
  if (is_model_loaded) {
- if (embedding && llama_model_has_encoder(llama->model) && llama_model_has_decoder(llama->model)) {
+ if (embedding && llama_model_has_encoder(llama->model.get()) && llama_model_has_decoder(llama->model.get())) {
  LOGI("[RNLlama] computing embeddings in encoder-decoder models is not supported");
- llama_free(llama->ctx);
+ llama_free(llama->ctx.get());
  return -1;
  }
- context_map[(long) llama->ctx] = llama;
+ context_map[(long) llama->ctx.get()] = llama;
  } else {
- llama_free(llama->ctx);
+ llama_free(llama->ctx.get());
  }

- return reinterpret_cast<jlong>(llama->ctx);
+ return reinterpret_cast<jlong>(llama->ctx.get());
  }


@@ -372,13 +373,13 @@ Java_com_rnllama_LlamaContext_loadModelDetails(
  UNUSED(thiz);
  auto llama = context_map[(long) context_ptr];

- int count = llama_model_meta_count(llama->model);
+ int count = llama_model_meta_count(llama->model.get());
  auto meta = createWriteableMap(env);
  for (int i = 0; i < count; i++) {
  char key[256];
- llama_model_meta_key_by_index(llama->model, i, key, sizeof(key));
+ llama_model_meta_key_by_index(llama->model.get(), i, key, sizeof(key));
  char val[2048];
- llama_model_meta_val_str_by_index(llama->model, i, val, sizeof(val));
+ llama_model_meta_val_str_by_index(llama->model.get(), i, val, sizeof(val));

  putString(env, meta, key, val);
  }
@@ -386,10 +387,10 @@ Java_com_rnllama_LlamaContext_loadModelDetails(
  auto result = createWriteableMap(env);

  char desc[1024];
- llama_model_desc(llama->model, desc, sizeof(desc));
+ llama_model_desc(llama->model.get(), desc, sizeof(desc));
  putString(env, result, "desc", desc);
- putDouble(env, result, "size", llama_model_size(llama->model));
- putDouble(env, result, "nParams", llama_model_n_params(llama->model));
+ putDouble(env, result, "size", llama_model_size(llama->model.get()));
+ putDouble(env, result, "nParams", llama_model_n_params(llama->model.get()));
  putBoolean(env, result, "isChatTemplateSupported", llama->validateModelChatTemplate());
  putMap(env, result, "metadata", meta);

@@ -431,7 +432,7 @@ Java_com_rnllama_LlamaContext_getFormattedChat(
  }

  const char *tmpl_chars = env->GetStringUTFChars(chat_template, nullptr);
- std::string formatted_chat = common_chat_apply_template(llama->model, tmpl_chars, chat, true);
+ std::string formatted_chat = common_chat_apply_template(llama->model.get(), tmpl_chars, chat, true);

  return env->NewStringUTF(formatted_chat.c_str());
  }
@@ -450,7 +451,7 @@ Java_com_rnllama_LlamaContext_loadSession(
  auto result = createWriteableMap(env);
  size_t n_token_count_out = 0;
  llama->embd.resize(llama->params.n_ctx);
- if (!llama_state_load_file(llama->ctx, path_chars, llama->embd.data(), llama->embd.capacity(), &n_token_count_out)) {
+ if (!llama_state_load_file(llama->ctx.get(), path_chars, llama->embd.data(), llama->embd.capacity(), &n_token_count_out)) {
  env->ReleaseStringUTFChars(path, path_chars);

  putString(env, result, "error", "Failed to load session");
@@ -459,7 +460,7 @@ Java_com_rnllama_LlamaContext_loadSession(
  llama->embd.resize(n_token_count_out);
  env->ReleaseStringUTFChars(path, path_chars);

- const std::string text = rnllama::tokens_to_str(llama->ctx, llama->embd.cbegin(), llama->embd.cend());
+ const std::string text = rnllama::tokens_to_str(llama->ctx.get(), llama->embd.cbegin(), llama->embd.cend());
  putInt(env, result, "tokens_loaded", n_token_count_out);
  putString(env, result, "prompt", text.c_str());
  return reinterpret_cast<jobject>(result);
@@ -481,7 +482,7 @@ Java_com_rnllama_LlamaContext_saveSession(
  std::vector<llama_token> session_tokens = llama->embd;
  int default_size = session_tokens.size();
  int save_size = size > 0 && size <= default_size ? size : default_size;
- if (!llama_state_save_file(llama->ctx, path_chars, session_tokens.data(), save_size)) {
+ if (!llama_state_save_file(llama->ctx.get(), path_chars, session_tokens.data(), save_size)) {
  env->ReleaseStringUTFChars(path, path_chars);
  return -1;
  }
@@ -499,13 +500,13 @@ static inline jobject tokenProbsToMap(
  for (const auto &prob : probs) {
  auto probsForToken = createWritableArray(env);
  for (const auto &p : prob.probs) {
- std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx, p.tok);
+ std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx.get(), p.tok);
  auto probResult = createWriteableMap(env);
  putString(env, probResult, "tok_str", tokStr.c_str());
  putDouble(env, probResult, "prob", p.prob);
  pushMap(env, probsForToken, probResult);
  }
- std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx, prob.tok);
+ std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx.get(), prob.tok);
  auto tokenResult = createWriteableMap(env);
  putString(env, tokenResult, "content", tokStr.c_str());
  putArray(env, tokenResult, "probs", probsForToken);
@@ -555,7 +556,7 @@ Java_com_rnllama_LlamaContext_doCompletion(

  llama->rewind();

- //llama_reset_timings(llama->ctx);
+ //llama_reset_timings(llama->ctx.get());

  llama->params.prompt = env->GetStringUTFChars(prompt, nullptr);
  llama->params.sampling.seed = (seed == -1) ? time(NULL) : seed;
@@ -593,7 +594,7 @@ Java_com_rnllama_LlamaContext_doCompletion(

  sparams.logit_bias.clear();
  if (ignore_eos) {
- sparams.logit_bias[llama_token_eos(llama->model)].bias = -INFINITY;
+ sparams.logit_bias[llama_token_eos(llama->model.get())].bias = -INFINITY;
  }

  // dry break seq
@@ -612,7 +613,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  sparams.dry_sequence_breakers = dry_sequence_breakers_vector;

  // logit bias
- const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx));
+ const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx.get()));
  jsize logit_bias_len = env->GetArrayLength(logit_bias);

  for (jsize i = 0; i < logit_bias_len; i++) {
@@ -659,7 +660,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  if (token_with_probs.tok == -1 || llama->incomplete) {
  continue;
  }
- const std::string token_text = common_token_to_piece(llama->ctx, token_with_probs.tok);
+ const std::string token_text = common_token_to_piece(llama->ctx.get(), token_with_probs.tok);

  size_t pos = std::min(sent_count, llama->generated_text.size());

@@ -694,7 +695,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  putString(env, tokenResult, "token", to_send.c_str());

  if (llama->params.sampling.n_probs > 0) {
- const std::vector<llama_token> to_send_toks = common_tokenize(llama->ctx, to_send, false);
+ const std::vector<llama_token> to_send_toks = common_tokenize(llama->ctx.get(), to_send, false);
  size_t probs_pos = std::min(sent_token_probs_index, llama->generated_token_probs.size());
  size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama->generated_token_probs.size());
  if (probs_pos < probs_stop_pos) {
@@ -711,7 +712,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  }
  }

- llama_perf_context_print(llama->ctx);
+ llama_perf_context_print(llama->ctx.get());
  llama->is_predicting = false;

  auto result = createWriteableMap(env);
@@ -726,7 +727,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  putString(env, result, "stopping_word", llama->stopping_word.c_str());
  putInt(env, result, "tokens_cached", llama->n_past);

- const auto timings_token = llama_perf_context(llama -> ctx);
+ const auto timings_token = llama_perf_context(llama -> ctx.get());

  auto timingsResult = createWriteableMap(env);
  putInt(env, timingsResult, "prompt_n", timings_token.n_p_eval);
@@ -770,7 +771,7 @@ Java_com_rnllama_LlamaContext_tokenize(
  const char *text_chars = env->GetStringUTFChars(text, nullptr);

  const std::vector<llama_token> toks = common_tokenize(
- llama->ctx,
+ llama->ctx.get(),
  text_chars,
  false
  );
@@ -797,7 +798,7 @@ Java_com_rnllama_LlamaContext_detokenize(
  toks.push_back(tokens_ptr[i]);
  }

- auto text = rnllama::tokens_to_str(llama->ctx, toks.cbegin(), toks.cend());
+ auto text = rnllama::tokens_to_str(llama->ctx.get(), toks.cbegin(), toks.cend());

  env->ReleaseIntArrayElements(tokens, tokens_ptr, 0);

@@ -834,7 +835,7 @@ Java_com_rnllama_LlamaContext_embedding(

  llama->rewind();

- llama_perf_context_reset(llama->ctx);
+ llama_perf_context_reset(llama->ctx.get());

  llama->params.prompt = text_chars;

@@ -860,7 +861,7 @@ Java_com_rnllama_LlamaContext_embedding(

  auto promptTokens = createWritableArray(env);
  for (const auto &tok : llama->embd) {
- pushString(env, promptTokens, common_token_to_piece(llama->ctx, tok).c_str());
+ pushString(env, promptTokens, common_token_to_piece(llama->ctx.get(), tok).c_str());
  }
  putArray(env, result, "prompt_tokens", promptTokens);

@@ -890,17 +891,17 @@ Java_com_rnllama_LlamaContext_freeContext(
  UNUSED(env);
  UNUSED(thiz);
  auto llama = context_map[(long) context_ptr];
- if (llama->model) {
- llama_free_model(llama->model);
+ if (llama->model.get()) {
+ llama_model_free(llama->model.get());
  }
- if (llama->ctx) {
- llama_free(llama->ctx);
+ if (llama->ctx.get()) {
+ llama_free(llama->ctx.get());
  }
- if (llama->ctx_sampling != nullptr)
+ /*if (llama->ctx.get()-> != nullptr)
  {
- common_sampler_free(llama->ctx_sampling);
- }
- context_map.erase((long) llama->ctx);
+ common_sampler_free(llama->ctx.get() -> _sampling);
+ }*/
+ context_map.erase((long) llama->ctx.get());
  }

  JNIEXPORT void JNICALL
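
The jni.cpp changes above are one mechanical migration: the rn-llama context now holds its llama_model and llama_context through the smart-pointer wrappers from the new llama-cpp.h header (llama_model_ptr, llama_context_ptr), so every call into the llama.cpp C API passes the raw pointer via .get(), and llama_free_model becomes llama_model_free. A minimal ownership sketch under that assumption; the struct and function names below are illustrative, not the actual rn-llama.hpp code:

    #include "llama-cpp.h"  // provides llama_model_ptr / llama_context_ptr (unique_ptr with llama deleters)

    struct rn_ctx_sketch {
        llama_model_ptr   model;  // released by its deleter when the struct is destroyed
        llama_context_ptr ctx;    // released by its deleter when the struct is destroyed
    };

    static bool rn_load_sketch(rn_ctx_sketch & rn, const char * gguf_path) {
        rn.model.reset(llama_model_load_from_file(gguf_path, llama_model_default_params()));
        if (!rn.model) {
            return false;
        }
        rn.ctx.reset(llama_new_context_with_model(rn.model.get(), llama_context_default_params()));
        // the C API always receives the raw pointer, hence the .get() calls throughout the diff above
        return rn.ctx != nullptr && llama_n_ctx(rn.ctx.get()) > 0;
    }
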
package/cpp/common.cpp CHANGED
@@ -2,6 +2,9 @@
  #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
  #endif

+ #include "ggml.h"
+ #include "gguf.h"
+
  #include "common.h"
  #include "log.h"
  // Change JSON_ASSERT from assert() to LM_GGML_ASSERT:
@@ -18,6 +21,7 @@
  #include <cstdarg>
  #include <cstring>
  #include <ctime>
+ #include <filesystem>
  #include <fstream>
  #include <iostream>
  #include <iterator>
@@ -68,7 +72,9 @@ char const *LLAMA_BUILD_TARGET = "unknown";
  #ifdef __linux__
  #include <linux/limits.h>
  #elif defined(_WIN32)
- #define PATH_MAX MAX_PATH
+ # if !defined(PATH_MAX)
+ #   define PATH_MAX MAX_PATH
+ # endif
  #else
  #include <sys/syslimits.h>
  #endif
@@ -849,7 +855,7 @@ struct common_init_result common_init_from_params(common_params & params) {
  } else if (!params.model_url.empty()) {
  model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
  } else {
- model = llama_load_model_from_file(params.model.c_str(), mparams);
+ model = llama_model_load_from_file(params.model.c_str(), mparams);
  }

  if (model == NULL) {
@@ -876,7 +882,7 @@ struct common_init_result common_init_from_params(common_params & params) {
  }

  if (!ok) {
- llama_free_model(model);
+ llama_model_free(model);

  return iparams;
  }
@@ -887,14 +893,13 @@ struct common_init_result common_init_from_params(common_params & params) {
  llama_context * lctx = llama_new_context_with_model(model, cparams);
  if (lctx == NULL) {
  LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
- llama_free_model(model);
+ llama_model_free(model);
  return iparams;
  }

  if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
- LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
- llama_free_model(model);
- return iparams;
+ LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+ params.ctx_shift = false;
  }

  if (!params.control_vectors.empty()) {
@@ -904,7 +909,7 @@ struct common_init_result common_init_from_params(common_params & params) {
  const auto cvec = common_control_vector_load(params.control_vectors);
  if (cvec.n_embd == -1) {
  llama_free(lctx);
- llama_free_model(model);
+ llama_model_free(model);

  return iparams;
  }
@@ -917,7 +922,7 @@ struct common_init_result common_init_from_params(common_params & params) {
  params.control_vector_layer_end);
  if (err) {
  llama_free(lctx);
- llama_free_model(model);
+ llama_model_free(model);

  return iparams;
  }
@@ -925,20 +930,21 @@ struct common_init_result common_init_from_params(common_params & params) {

  // load and optionally apply lora adapters
  for (auto & la : params.lora_adapters) {
- common_lora_adapter_container loaded_la;
- loaded_la.path = la.path;
- loaded_la.scale = la.scale;
- loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
- if (loaded_la.adapter == nullptr) {
+ llama_lora_adapter_ptr lora;
+ lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+ if (lora == nullptr) {
  LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
  llama_free(lctx);
- llama_free_model(model);
+ llama_model_free(model);
  return iparams;
  }
- iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+
+ la.ptr = lora.get();
+ iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
  }
+
  if (!params.lora_init_without_apply) {
- common_lora_adapters_apply(lctx, iparams.lora_adapters);
+ common_lora_adapters_apply(lctx, params.lora_adapters);
  }

  if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -985,7 +991,7 @@ struct common_init_result common_init_from_params(common_params & params) {
  if (llama_model_has_encoder(model)) {
  llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
  llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
- if (decoder_start_token_id == -1) {
+ if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
  decoder_start_token_id = bos;
  }
  tmp.clear();
@@ -999,17 +1005,17 @@ struct common_init_result common_init_from_params(common_params & params) {
  llama_perf_context_reset(lctx);
  }

- iparams.model = model;
- iparams.context = lctx;
+ iparams.model.reset(model);
+ iparams.context.reset(lctx);

  return iparams;
  }

- void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
+ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
  llama_lora_adapter_clear(ctx);
- for (auto & la : lora_adapters) {
+ for (auto & la : lora) {
  if (la.scale != 0.0f) {
- llama_lora_adapter_set(ctx, la.adapter, la.scale);
+ llama_lora_adapter_set(ctx, la.ptr, la.scale);
  }
  }
  }
@@ -1105,7 +1111,7 @@ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const
  #define CURL_MAX_RETRY 3
  #define CURL_RETRY_DELAY_SECONDS 2

- static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
+ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
  int remaining_attempts = max_attempts;

  while (remaining_attempts > 0) {
@@ -1129,7 +1135,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
  }

  static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-
  // Initialize libcurl
  std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
  if (!curl) {
@@ -1159,8 +1164,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
  #endif

  // Check if the file already exists locally
- struct stat model_file_info;
- auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
+ auto file_exists = std::filesystem::exists(path);

  // If the file exists, check its JSON metadata companion file.
  std::string metadata_path = path + ".json";
@@ -1202,11 +1206,13 @@ static bool common_download_file(const std::string & url, const std::string & pa
  std::string etag;
  std::string last_modified;
  };
+
  common_load_model_from_url_headers headers;
+
  {
  typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
  auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
- common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
+ common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;

  static std::regex header_regex("([^:]+): (.*)\r\n");
  static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1418,7 +1424,7 @@ struct llama_model * common_load_model_from_url(
  }
  }

- return llama_load_model_from_file(local_path.c_str(), params);
+ return llama_model_load_from_file(local_path.c_str(), params);
  }

  struct llama_model * common_load_model_from_hf(
@@ -1621,6 +1627,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
  // Chat template utils
  //

+ std::string common_get_builtin_chat_template(const struct llama_model * model) {
+ static const char * template_key = "tokenizer.chat_template";
+ // call with NULL buffer to get the total size of the string
+ int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
+ if (res > 0) {
+ std::vector<char> model_template(res + 1, 0);
+ llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
+ return std::string(model_template.data(), model_template.size() - 1);
+ }
+ return "";
+ }
+
  bool common_chat_verify_template(const std::string & tmpl) {
  llama_chat_message chat[] = {{"user", "test"}};
  int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
@@ -1790,7 +1808,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
  break;
  case 0: // max absolute
  for (int i = 0; i < n; i++) {
- if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
+ if (sum < std::abs(inp[i])) {
+ sum = std::abs(inp[i]);
+ }
  }
  sum /= 32760.0; // make an int16 range
  break;
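
One practical consequence of the common.cpp changes above: common_init_from_params() now keeps ownership of each adapter in iparams.lora (llama_lora_adapter_ptr) and fills the non-owning ptr field of the common_lora_adapter_info entries in params.lora_adapters, which is what common_lora_adapters_apply() consumes. A hedged usage sketch under that assumption; the helper name is illustrative, not code from the package:

    #include "common.h"

    // Sketch: rescale and re-apply the LoRA adapters loaded by common_init_from_params().
    // Assumes the post-change API shown in this diff.
    static void rescale_loras(common_params & params, common_init_result & init, float scale) {
        for (auto & la : params.lora_adapters) {
            la.scale = scale;          // la.ptr was set during common_init_from_params()
        }
        // clears the context's adapters, then re-applies every entry with a non-zero scale
        common_lora_adapters_apply(init.context.get(), params.lora_adapters);
    }
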
package/cpp/common.h CHANGED
@@ -2,7 +2,7 @@

  #pragma once

- #include "llama.h"
+ #include "llama-cpp.h"

  #include <string>
  #include <vector>
@@ -27,10 +27,8 @@
  struct common_lora_adapter_info {
  std::string path;
  float scale;
- };

- struct common_lora_adapter_container : common_lora_adapter_info {
- struct llama_lora_adapter * adapter;
+ struct llama_lora_adapter * ptr;
  };

  using llama_tokens = std::vector<llama_token>;
@@ -91,6 +89,7 @@ enum llama_example {
  LLAMA_EXAMPLE_LLAVA,
  LLAMA_EXAMPLE_LOOKUP,
  LLAMA_EXAMPLE_PARALLEL,
+ LLAMA_EXAMPLE_TTS,

  LLAMA_EXAMPLE_COUNT,
  };
@@ -170,6 +169,7 @@ struct common_params_sampling {

  struct common_params_speculative {
  std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
+
  int32_t n_ctx = 0; // draft context size
  int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
  int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
@@ -183,6 +183,14 @@ struct common_params_speculative {
  std::string model = ""; // draft model for speculative decoding // NOLINT
  };

+ struct common_params_vocoder {
+ std::string hf_repo = ""; // HF repo // NOLINT
+ std::string hf_file = ""; // HF file // NOLINT
+
+ std::string model = ""; // model path // NOLINT
+ std::string model_url = ""; // model url to download // NOLINT
+ };
+
  struct common_params {

  void * progress_callback_user_data = nullptr;
@@ -229,8 +237,9 @@ struct common_params {
  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
  enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

- struct common_params_sampling sampling;
+ struct common_params_sampling    sampling;
  struct common_params_speculative speculative;
+ struct common_params_vocoder     vocoder;

  std::string model = ""; // model path // NOLINT
  std::string model_alias = ""; // model alias // NOLINT
@@ -482,10 +491,12 @@ std::string fs_get_cache_file(const std::string & filename);
  // Model utils
  //

+ // note: defines object's lifetime
  struct common_init_result {
- struct llama_model * model = nullptr;
- struct llama_context * context = nullptr;
- std::vector<common_lora_adapter_container> lora_adapters;
+ llama_model_ptr   model;
+ llama_context_ptr context;
+
+ std::vector<llama_lora_adapter_ptr> lora;
  };

  struct common_init_result common_init_from_params(common_params & params);
@@ -507,7 +518,7 @@ struct llama_model * common_load_model_from_hf(
  const struct llama_model_params & params);

  // clear LoRA adapters from context, then apply new list of adapters
- void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
+ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);

  //
  // Batch utils
@@ -575,6 +586,9 @@ struct common_chat_msg {
  std::string content;
  };

+ // Get the built-in chat template for the model. Return empty string if not present.
+ std::string common_get_builtin_chat_template(const struct llama_model * model);
+
  // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
  bool common_chat_verify_template(const std::string & tmpl);

@@ -611,7 +625,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
  // Embedding utils
  //

- void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+ // TODO: repace embd_norm with an enum
+ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);

  float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);

@@ -640,6 +655,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
  // Split utils
  //

- static const char * const LLM_KV_SPLIT_NO = "split.no";
- static const char * const LLM_KV_SPLIT_COUNT = "split.count";
- static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+ namespace {
+
+ const char * const LLM_KV_SPLIT_NO = "split.no";
+ const char * const LLM_KV_SPLIT_COUNT = "split.count";
+ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+ }
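
The net effect of the common.h changes is that common_init_result now owns what it returns: model, context, and adapters are released automatically when the struct goes out of scope, so callers drop their manual llama_free()/llama_free_model() cleanup and only use .get() when handing pointers to the C API. A minimal caller sketch under that assumption (error handling trimmed, function name illustrative):

    #include <cstdio>
    #include "common.h"

    static int run_once_sketch(common_params & params) {
        common_init_result init = common_init_from_params(params);
        if (!init.model || !init.context) {
            return 1;                              // nothing to free on failure
        }
        llama_context * ctx = init.context.get();  // raw pointer for C API calls
        printf("context size: %d\n", (int) llama_n_ctx(ctx));
        return 0;  // model, context, and LoRA adapters are freed by init's destructor
    }
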
package/cpp/ggml-alloc.c CHANGED
@@ -534,7 +534,6 @@ static void lm_ggml_gallocr_allocate_node(lm_ggml_gallocr_t galloc, struct lm_gg
  size_t offset = lm_ggml_dyn_tallocr_alloc(alloc, size, node);
  hn->buffer_id = buffer_id;
  hn->offset = offset;
- return;
  }
  }