cui-llama.rn 1.3.5 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/android/src/main/CMakeLists.txt +14 -8
  2. package/android/src/main/jni.cpp +38 -37
  3. package/cpp/common.cpp +43 -26
  4. package/cpp/common.h +18 -11
  5. package/cpp/ggml-backend-reg.cpp +5 -0
  6. package/cpp/ggml-backend.cpp +5 -2
  7. package/cpp/ggml-cpp.h +1 -0
  8. package/cpp/ggml-cpu-aarch64.cpp +6 -1
  9. package/cpp/ggml-cpu-quants.c +5 -1
  10. package/cpp/ggml-impl.h +11 -16
  11. package/cpp/ggml-metal.m +2 -2
  12. package/cpp/ggml.c +0 -1276
  13. package/cpp/ggml.h +0 -140
  14. package/cpp/gguf.cpp +1325 -0
  15. package/cpp/gguf.h +202 -0
  16. package/cpp/llama-adapter.cpp +346 -0
  17. package/cpp/llama-adapter.h +73 -0
  18. package/cpp/llama-arch.cpp +1434 -0
  19. package/cpp/llama-arch.h +395 -0
  20. package/cpp/llama-batch.cpp +368 -0
  21. package/cpp/llama-batch.h +88 -0
  22. package/cpp/llama-chat.cpp +567 -0
  23. package/cpp/llama-chat.h +51 -0
  24. package/cpp/llama-context.cpp +1771 -0
  25. package/cpp/llama-context.h +128 -0
  26. package/cpp/llama-cparams.cpp +1 -0
  27. package/cpp/llama-cparams.h +37 -0
  28. package/cpp/llama-cpp.h +30 -0
  29. package/cpp/llama-grammar.cpp +1 -0
  30. package/cpp/llama-grammar.h +3 -1
  31. package/cpp/llama-hparams.cpp +71 -0
  32. package/cpp/llama-hparams.h +140 -0
  33. package/cpp/llama-impl.cpp +167 -0
  34. package/cpp/llama-impl.h +16 -136
  35. package/cpp/llama-kv-cache.cpp +718 -0
  36. package/cpp/llama-kv-cache.h +218 -0
  37. package/cpp/llama-mmap.cpp +589 -0
  38. package/cpp/llama-mmap.h +67 -0
  39. package/cpp/llama-model-loader.cpp +1011 -0
  40. package/cpp/llama-model-loader.h +158 -0
  41. package/cpp/llama-model.cpp +2202 -0
  42. package/cpp/llama-model.h +391 -0
  43. package/cpp/llama-sampling.cpp +117 -4
  44. package/cpp/llama-vocab.cpp +21 -28
  45. package/cpp/llama-vocab.h +13 -1
  46. package/cpp/llama.cpp +8437 -19421
  47. package/cpp/llama.cpp.rej +23 -0
  48. package/cpp/llama.h +31 -6
  49. package/cpp/rn-llama.hpp +39 -37
  50. package/cpp/sgemm.cpp +776 -70
  51. package/cpp/unicode.cpp +6 -0
  52. package/package.json +1 -1
package/android/src/main/CMakeLists.txt CHANGED
@@ -9,17 +9,23 @@ include_directories(${RNLLAMA_LIB_DIR})
 
  set(
  SOURCE_FILES
- ${RNLLAMA_LIB_DIR}/llama-grammar.cpp
- ${RNLLAMA_LIB_DIR}/llama-sampling.cpp
- ${RNLLAMA_LIB_DIR}/llama-vocab.cpp
- ${RNLLAMA_LIB_DIR}/log.cpp
-
- #${RNLLAMA_LIB_DIR}/amx/amx.cpp
- #${RNLLAMA_LIB_DIR}/amx/mmq.cpp
 
+ ${RNLLAMA_LIB_DIR}/common.cpp
  ${RNLLAMA_LIB_DIR}/llama-grammar.cpp
  ${RNLLAMA_LIB_DIR}/llama-sampling.cpp
  ${RNLLAMA_LIB_DIR}/llama-vocab.cpp
+ ${RNLLAMA_LIB_DIR}/llama-chat.cpp
+ ${RNLLAMA_LIB_DIR}/llama-mmap.cpp
+ ${RNLLAMA_LIB_DIR}/llama-context.cpp
+ ${RNLLAMA_LIB_DIR}/llama-kv-cache.cpp
+ ${RNLLAMA_LIB_DIR}/llama-model-loader.cpp
+ ${RNLLAMA_LIB_DIR}/llama-model.cpp
+ ${RNLLAMA_LIB_DIR}/llama-batch.cpp
+ ${RNLLAMA_LIB_DIR}/llama-arch.cpp
+ ${RNLLAMA_LIB_DIR}/llama-cparams.cpp
+ ${RNLLAMA_LIB_DIR}/llama-hparams.cpp
+ ${RNLLAMA_LIB_DIR}/llama-adapter.cpp
+ ${RNLLAMA_LIB_DIR}/llama-impl.cpp
  ${RNLLAMA_LIB_DIR}/log.cpp
  ${RNLLAMA_LIB_DIR}/json.hpp
  ${RNLLAMA_LIB_DIR}/json-schema-to-grammar.cpp
@@ -28,6 +34,7 @@ set(
  ${RNLLAMA_LIB_DIR}/ggml-backend.cpp
  ${RNLLAMA_LIB_DIR}/ggml-backend-reg.cpp
  ${RNLLAMA_LIB_DIR}/ggml.c
+ ${RNLLAMA_LIB_DIR}/gguf.cpp
  ${RNLLAMA_LIB_DIR}/ggml-cpu.c
  ${RNLLAMA_LIB_DIR}/ggml-cpu.cpp
  ${RNLLAMA_LIB_DIR}/ggml-cpu-aarch64.cpp
@@ -35,7 +42,6 @@ set(
  ${RNLLAMA_LIB_DIR}/ggml-cpu-quants.c
  ${RNLLAMA_LIB_DIR}/ggml-threading.cpp
  ${RNLLAMA_LIB_DIR}/ggml-quants.c
- ${RNLLAMA_LIB_DIR}/common.cpp
  ${RNLLAMA_LIB_DIR}/sampling.cpp
  ${RNLLAMA_LIB_DIR}/unicode-data.cpp
  ${RNLLAMA_LIB_DIR}/unicode.cpp
package/android/src/main/jni.cpp CHANGED
@@ -11,7 +11,8 @@
  #include <unordered_map>
  #include "llama.h"
  #include "llama-impl.h"
- #include "ggml.h"
+ #include "llama-context.h"
+ #include "gguf.h"
  #include "rn-llama.hpp"
 
  #define UNUSED(x) (void)(x)
@@ -336,17 +337,17 @@ Java_com_rnllama_LlamaContext_initContext(
 
  LOGI("[RNLlama] is_model_loaded %s", (is_model_loaded ? "true" : "false"));
  if (is_model_loaded) {
- if (embedding && llama_model_has_encoder(llama->model) && llama_model_has_decoder(llama->model)) {
+ if (embedding && llama_model_has_encoder(llama->model.get()) && llama_model_has_decoder(llama->model.get())) {
  LOGI("[RNLlama] computing embeddings in encoder-decoder models is not supported");
- llama_free(llama->ctx);
+ llama_free(llama->ctx.get());
  return -1;
  }
- context_map[(long) llama->ctx] = llama;
+ context_map[(long) llama->ctx.get()] = llama;
  } else {
- llama_free(llama->ctx);
+ llama_free(llama->ctx.get());
  }
 
- return reinterpret_cast<jlong>(llama->ctx);
+ return reinterpret_cast<jlong>(llama->ctx.get());
  }
 
 
@@ -372,13 +373,13 @@ Java_com_rnllama_LlamaContext_loadModelDetails(
  UNUSED(thiz);
  auto llama = context_map[(long) context_ptr];
 
- int count = llama_model_meta_count(llama->model);
+ int count = llama_model_meta_count(llama->model.get());
  auto meta = createWriteableMap(env);
  for (int i = 0; i < count; i++) {
  char key[256];
- llama_model_meta_key_by_index(llama->model, i, key, sizeof(key));
+ llama_model_meta_key_by_index(llama->model.get(), i, key, sizeof(key));
  char val[2048];
- llama_model_meta_val_str_by_index(llama->model, i, val, sizeof(val));
+ llama_model_meta_val_str_by_index(llama->model.get(), i, val, sizeof(val));
 
  putString(env, meta, key, val);
  }
@@ -386,10 +387,10 @@ Java_com_rnllama_LlamaContext_loadModelDetails(
  auto result = createWriteableMap(env);
 
  char desc[1024];
- llama_model_desc(llama->model, desc, sizeof(desc));
+ llama_model_desc(llama->model.get(), desc, sizeof(desc));
  putString(env, result, "desc", desc);
- putDouble(env, result, "size", llama_model_size(llama->model));
- putDouble(env, result, "nParams", llama_model_n_params(llama->model));
+ putDouble(env, result, "size", llama_model_size(llama->model.get()));
+ putDouble(env, result, "nParams", llama_model_n_params(llama->model.get()));
  putBoolean(env, result, "isChatTemplateSupported", llama->validateModelChatTemplate());
  putMap(env, result, "metadata", meta);
 
@@ -431,7 +432,7 @@ Java_com_rnllama_LlamaContext_getFormattedChat(
  }
 
  const char *tmpl_chars = env->GetStringUTFChars(chat_template, nullptr);
- std::string formatted_chat = common_chat_apply_template(llama->model, tmpl_chars, chat, true);
+ std::string formatted_chat = common_chat_apply_template(llama->model.get(), tmpl_chars, chat, true);
 
  return env->NewStringUTF(formatted_chat.c_str());
  }
@@ -450,7 +451,7 @@ Java_com_rnllama_LlamaContext_loadSession(
  auto result = createWriteableMap(env);
  size_t n_token_count_out = 0;
  llama->embd.resize(llama->params.n_ctx);
- if (!llama_state_load_file(llama->ctx, path_chars, llama->embd.data(), llama->embd.capacity(), &n_token_count_out)) {
+ if (!llama_state_load_file(llama->ctx.get(), path_chars, llama->embd.data(), llama->embd.capacity(), &n_token_count_out)) {
  env->ReleaseStringUTFChars(path, path_chars);
 
  putString(env, result, "error", "Failed to load session");
@@ -459,7 +460,7 @@ Java_com_rnllama_LlamaContext_loadSession(
  llama->embd.resize(n_token_count_out);
  env->ReleaseStringUTFChars(path, path_chars);
 
- const std::string text = rnllama::tokens_to_str(llama->ctx, llama->embd.cbegin(), llama->embd.cend());
+ const std::string text = rnllama::tokens_to_str(llama->ctx.get(), llama->embd.cbegin(), llama->embd.cend());
  putInt(env, result, "tokens_loaded", n_token_count_out);
  putString(env, result, "prompt", text.c_str());
  return reinterpret_cast<jobject>(result);
@@ -481,7 +482,7 @@ Java_com_rnllama_LlamaContext_saveSession(
  std::vector<llama_token> session_tokens = llama->embd;
  int default_size = session_tokens.size();
  int save_size = size > 0 && size <= default_size ? size : default_size;
- if (!llama_state_save_file(llama->ctx, path_chars, session_tokens.data(), save_size)) {
+ if (!llama_state_save_file(llama->ctx.get(), path_chars, session_tokens.data(), save_size)) {
  env->ReleaseStringUTFChars(path, path_chars);
  return -1;
  }
@@ -499,13 +500,13 @@ static inline jobject tokenProbsToMap(
  for (const auto &prob : probs) {
  auto probsForToken = createWritableArray(env);
  for (const auto &p : prob.probs) {
- std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx, p.tok);
+ std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx.get(), p.tok);
  auto probResult = createWriteableMap(env);
  putString(env, probResult, "tok_str", tokStr.c_str());
  putDouble(env, probResult, "prob", p.prob);
  pushMap(env, probsForToken, probResult);
  }
- std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx, prob.tok);
+ std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx.get(), prob.tok);
  auto tokenResult = createWriteableMap(env);
  putString(env, tokenResult, "content", tokStr.c_str());
  putArray(env, tokenResult, "probs", probsForToken);
@@ -555,7 +556,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 
  llama->rewind();
 
- //llama_reset_timings(llama->ctx);
+ //llama_reset_timings(llama->ctx.get());
 
  llama->params.prompt = env->GetStringUTFChars(prompt, nullptr);
  llama->params.sampling.seed = (seed == -1) ? time(NULL) : seed;
@@ -593,7 +594,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 
  sparams.logit_bias.clear();
  if (ignore_eos) {
- sparams.logit_bias[llama_token_eos(llama->model)].bias = -INFINITY;
+ sparams.logit_bias[llama_token_eos(llama->model.get())].bias = -INFINITY;
  }
 
  // dry break seq
@@ -612,7 +613,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  sparams.dry_sequence_breakers = dry_sequence_breakers_vector;
 
  // logit bias
- const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx));
+ const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx.get()));
  jsize logit_bias_len = env->GetArrayLength(logit_bias);
 
  for (jsize i = 0; i < logit_bias_len; i++) {
@@ -659,7 +660,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  if (token_with_probs.tok == -1 || llama->incomplete) {
  continue;
  }
- const std::string token_text = common_token_to_piece(llama->ctx, token_with_probs.tok);
+ const std::string token_text = common_token_to_piece(llama->ctx.get(), token_with_probs.tok);
 
  size_t pos = std::min(sent_count, llama->generated_text.size());
 
@@ -694,7 +695,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  putString(env, tokenResult, "token", to_send.c_str());
 
  if (llama->params.sampling.n_probs > 0) {
- const std::vector<llama_token> to_send_toks = common_tokenize(llama->ctx, to_send, false);
+ const std::vector<llama_token> to_send_toks = common_tokenize(llama->ctx.get(), to_send, false);
  size_t probs_pos = std::min(sent_token_probs_index, llama->generated_token_probs.size());
  size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama->generated_token_probs.size());
  if (probs_pos < probs_stop_pos) {
@@ -711,7 +712,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  }
  }
 
- llama_perf_context_print(llama->ctx);
+ llama_perf_context_print(llama->ctx.get());
  llama->is_predicting = false;
 
  auto result = createWriteableMap(env);
@@ -726,7 +727,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  putString(env, result, "stopping_word", llama->stopping_word.c_str());
  putInt(env, result, "tokens_cached", llama->n_past);
 
- const auto timings_token = llama_perf_context(llama -> ctx);
+ const auto timings_token = llama_perf_context(llama -> ctx.get());
 
  auto timingsResult = createWriteableMap(env);
  putInt(env, timingsResult, "prompt_n", timings_token.n_p_eval);
@@ -770,7 +771,7 @@ Java_com_rnllama_LlamaContext_tokenize(
  const char *text_chars = env->GetStringUTFChars(text, nullptr);
 
  const std::vector<llama_token> toks = common_tokenize(
- llama->ctx,
+ llama->ctx.get(),
  text_chars,
  false
  );
@@ -797,7 +798,7 @@ Java_com_rnllama_LlamaContext_detokenize(
  toks.push_back(tokens_ptr[i]);
  }
 
- auto text = rnllama::tokens_to_str(llama->ctx, toks.cbegin(), toks.cend());
+ auto text = rnllama::tokens_to_str(llama->ctx.get(), toks.cbegin(), toks.cend());
 
  env->ReleaseIntArrayElements(tokens, tokens_ptr, 0);
 
@@ -834,7 +835,7 @@ Java_com_rnllama_LlamaContext_embedding(
 
  llama->rewind();
 
- llama_perf_context_reset(llama->ctx);
+ llama_perf_context_reset(llama->ctx.get());
 
  llama->params.prompt = text_chars;
 
@@ -860,7 +861,7 @@ Java_com_rnllama_LlamaContext_embedding(
 
  auto promptTokens = createWritableArray(env);
  for (const auto &tok : llama->embd) {
- pushString(env, promptTokens, common_token_to_piece(llama->ctx, tok).c_str());
+ pushString(env, promptTokens, common_token_to_piece(llama->ctx.get(), tok).c_str());
  }
  putArray(env, result, "prompt_tokens", promptTokens);
 
@@ -890,17 +891,17 @@ Java_com_rnllama_LlamaContext_freeContext(
  UNUSED(env);
  UNUSED(thiz);
  auto llama = context_map[(long) context_ptr];
- if (llama->model) {
- llama_free_model(llama->model);
+ if (llama->model.get()) {
+ llama_model_free(llama->model.get());
  }
- if (llama->ctx) {
- llama_free(llama->ctx);
+ if (llama->ctx.get()) {
+ llama_free(llama->ctx.get());
  }
- if (llama->ctx_sampling != nullptr)
+ /*if (llama->ctx.get()-> != nullptr)
  {
- common_sampler_free(llama->ctx_sampling);
- }
- context_map.erase((long) llama->ctx);
+ common_sampler_free(llama->ctx.get() -> _sampling);
+ }*/
+ context_map.erase((long) llama->ctx.get());
  }
 
  JNIEXPORT void JNICALL
package/cpp/common.cpp CHANGED
@@ -2,6 +2,9 @@
  #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
  #endif
 
+ #include "ggml.h"
+ #include "gguf.h"
+
  #include "common.h"
  #include "log.h"
  // Change JSON_ASSERT from assert() to LM_GGML_ASSERT:
@@ -18,6 +21,7 @@
  #include <cstdarg>
  #include <cstring>
  #include <ctime>
+ #include <filesystem>
  #include <fstream>
  #include <iostream>
  #include <iterator>
@@ -68,7 +72,9 @@ char const *LLAMA_BUILD_TARGET = "unknown";
  #ifdef __linux__
  #include <linux/limits.h>
  #elif defined(_WIN32)
- #define PATH_MAX MAX_PATH
+ # if !defined(PATH_MAX)
+ # define PATH_MAX MAX_PATH
+ # endif
  #else
  #include <sys/syslimits.h>
  #endif
@@ -849,7 +855,7 @@ struct common_init_result common_init_from_params(common_params & params) {
  } else if (!params.model_url.empty()) {
  model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
  } else {
- model = llama_load_model_from_file(params.model.c_str(), mparams);
+ model = llama_model_load_from_file(params.model.c_str(), mparams);
  }
 
  if (model == NULL) {
@@ -876,7 +882,7 @@ struct common_init_result common_init_from_params(common_params & params) {
  }
 
  if (!ok) {
- llama_free_model(model);
+ llama_model_free(model);
 
  return iparams;
  }
@@ -887,14 +893,13 @@ struct common_init_result common_init_from_params(common_params & params) {
  llama_context * lctx = llama_new_context_with_model(model, cparams);
  if (lctx == NULL) {
  LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
- llama_free_model(model);
+ llama_model_free(model);
  return iparams;
  }
 
  if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
- LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
- llama_free_model(model);
- return iparams;
+ LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+ params.ctx_shift = false;
  }
 
  if (!params.control_vectors.empty()) {
@@ -904,7 +909,7 @@ struct common_init_result common_init_from_params(common_params & params) {
  const auto cvec = common_control_vector_load(params.control_vectors);
  if (cvec.n_embd == -1) {
  llama_free(lctx);
- llama_free_model(model);
+ llama_model_free(model);
 
  return iparams;
  }
@@ -917,7 +922,7 @@ struct common_init_result common_init_from_params(common_params & params) {
  params.control_vector_layer_end);
  if (err) {
  llama_free(lctx);
- llama_free_model(model);
+ llama_model_free(model);
 
  return iparams;
  }
@@ -925,20 +930,21 @@ struct common_init_result common_init_from_params(common_params & params) {
 
  // load and optionally apply lora adapters
  for (auto & la : params.lora_adapters) {
- common_lora_adapter_container loaded_la;
- loaded_la.path = la.path;
- loaded_la.scale = la.scale;
- loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
- if (loaded_la.adapter == nullptr) {
+ llama_lora_adapter_ptr lora;
+ lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+ if (lora == nullptr) {
  LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
  llama_free(lctx);
- llama_free_model(model);
+ llama_model_free(model);
  return iparams;
  }
- iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+
+ la.ptr = lora.get();
+ iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
  }
+
  if (!params.lora_init_without_apply) {
- common_lora_adapters_apply(lctx, iparams.lora_adapters);
+ common_lora_adapters_apply(lctx, params.lora_adapters);
  }
 
  if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -985,7 +991,7 @@ struct common_init_result common_init_from_params(common_params & params) {
  if (llama_model_has_encoder(model)) {
  llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
  llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
- if (decoder_start_token_id == -1) {
+ if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
  decoder_start_token_id = bos;
  }
  tmp.clear();
@@ -999,17 +1005,17 @@ struct common_init_result common_init_from_params(common_params & params) {
  llama_perf_context_reset(lctx);
  }
 
- iparams.model = model;
- iparams.context = lctx;
+ iparams.model.reset(model);
+ iparams.context.reset(lctx);
 
  return iparams;
  }
 
- void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
+ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
  llama_lora_adapter_clear(ctx);
- for (auto & la : lora_adapters) {
+ for (auto & la : lora) {
  if (la.scale != 0.0f) {
- llama_lora_adapter_set(ctx, la.adapter, la.scale);
+ llama_lora_adapter_set(ctx, la.ptr, la.scale);
  }
  }
  }
@@ -1158,8 +1164,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
  #endif
 
  // Check if the file already exists locally
- struct stat model_file_info;
- auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
+ auto file_exists = std::filesystem::exists(path);
 
  // If the file exists, check its JSON metadata companion file.
  std::string metadata_path = path + ".json";
@@ -1419,7 +1424,7 @@ struct llama_model * common_load_model_from_url(
  }
  }
 
- return llama_load_model_from_file(local_path.c_str(), params);
+ return llama_model_load_from_file(local_path.c_str(), params);
  }
 
  struct llama_model * common_load_model_from_hf(
@@ -1622,6 +1627,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
  // Chat template utils
  //
 
+ std::string common_get_builtin_chat_template(const struct llama_model * model) {
+ static const char * template_key = "tokenizer.chat_template";
+ // call with NULL buffer to get the total size of the string
+ int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
+ if (res > 0) {
+ std::vector<char> model_template(res + 1, 0);
+ llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
+ return std::string(model_template.data(), model_template.size() - 1);
+ }
+ return "";
+ }
+
  bool common_chat_verify_template(const std::string & tmpl) {
  llama_chat_message chat[] = {{"user", "test"}};
  int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
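
A minimal caller-side sketch of the common_get_builtin_chat_template helper introduced above, using the renamed llama_model_load_from_file / llama_model_free entry points from this release; the model path is hypothetical and the snippet is illustrative only, not part of the package:

    #include "common.h"
    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_model_params mparams = llama_model_default_params();
        // hypothetical path, for illustration only
        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model == NULL) {
            return 1;
        }
        // returns "" when the GGUF metadata has no tokenizer.chat_template entry
        const std::string tmpl = common_get_builtin_chat_template(model);
        std::printf("built-in chat template:\n%s\n", tmpl.c_str());
        llama_model_free(model);
        return 0;
    }
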
package/cpp/common.h CHANGED
@@ -2,7 +2,7 @@
 
  #pragma once
 
- #include "llama.h"
+ #include "llama-cpp.h"
 
  #include <string>
  #include <vector>
@@ -27,10 +27,8 @@
  struct common_lora_adapter_info {
  std::string path;
  float scale;
- };
 
- struct common_lora_adapter_container : common_lora_adapter_info {
- struct llama_lora_adapter * adapter;
+ struct llama_lora_adapter * ptr;
  };
 
  using llama_tokens = std::vector<llama_token>;
@@ -493,10 +491,12 @@ std::string fs_get_cache_file(const std::string & filename);
  // Model utils
  //
 
+ // note: defines object's lifetime
  struct common_init_result {
- struct llama_model * model = nullptr;
- struct llama_context * context = nullptr;
- std::vector<common_lora_adapter_container> lora_adapters;
+ llama_model_ptr model;
+ llama_context_ptr context;
+
+ std::vector<llama_lora_adapter_ptr> lora;
  };
 
  struct common_init_result common_init_from_params(common_params & params);
@@ -518,7 +518,7 @@ struct llama_model * common_load_model_from_hf(
  const struct llama_model_params & params);
 
  // clear LoRA adapters from context, then apply new list of adapters
- void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
+ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
 
  //
  // Batch utils
@@ -586,6 +586,9 @@ struct common_chat_msg {
  std::string content;
  };
 
+ // Get the built-in chat template for the model. Return empty string if not present.
+ std::string common_get_builtin_chat_template(const struct llama_model * model);
+
  // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
  bool common_chat_verify_template(const std::string & tmpl);
 
@@ -652,6 +655,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
  // Split utils
  //
 
- static const char * const LLM_KV_SPLIT_NO = "split.no";
- static const char * const LLM_KV_SPLIT_COUNT = "split.count";
- static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+ namespace {
+
+ const char * const LLM_KV_SPLIT_NO = "split.no";
+ const char * const LLM_KV_SPLIT_COUNT = "split.count";
+ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+ }
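
With common_init_result now owning the model and context through llama_model_ptr / llama_context_ptr from llama-cpp.h, callers hold the result and pass raw handles via .get(), the same pattern jni.cpp uses with llama->model.get() and llama->ctx.get(). A minimal sketch under those assumptions (filling in common_params is omitted); illustrative only, not part of the package:

    #include "common.h"
    #include <cstdio>

    // illustrative helper; common_params is assumed to be populated elsewhere
    static void run(common_params & params) {
        common_init_result init = common_init_from_params(params);
        if (!init.model || !init.context) {
            std::fprintf(stderr, "failed to load model or create context\n");
            return;
        }
        llama_model   * model = init.model.get();   // raw handle, ownership stays with init
        llama_context * lctx  = init.context.get();
        std::printf("n_ctx = %u\n", llama_n_ctx(lctx));
        std::printf("n_params = %llu\n", (unsigned long long) llama_model_n_params(model));
        // model, context and any adapters in init.lora are released when init goes out of scope
    }
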
package/cpp/ggml-backend-reg.cpp CHANGED
@@ -574,4 +574,9 @@ void lm_ggml_backend_load_all_from_path(const char * dir_path) {
  lm_ggml_backend_load_best("opencl", silent, dir_path);
  lm_ggml_backend_load_best("musa", silent, dir_path);
  lm_ggml_backend_load_best("cpu", silent, dir_path);
+ // check the environment variable LM_GGML_BACKEND_PATH to load an out-of-tree backend
+ const char * backend_path = std::getenv("LM_GGML_BACKEND_PATH");
+ if (backend_path) {
+ lm_ggml_backend_load(backend_path);
+ }
  }
package/cpp/ggml-backend.cpp CHANGED
@@ -764,7 +764,7 @@ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sch
  if (tensor->op != LM_GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
  int src_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
  // check if a backend with higher prio wants to offload the op
- if (src_backend_id == sched->n_backends - 1) {
+ if (src_backend_id == sched->n_backends - 1 && lm_ggml_backend_buffer_is_host(src->buffer)) {
  for (int b = 0; b < src_backend_id; b++) {
  if (lm_ggml_backend_supports_op(sched->backends[b], tensor) && lm_ggml_backend_offload_op(sched->backends[b], tensor)) {
  SET_CAUSE(tensor, "1.off");
@@ -795,9 +795,12 @@ static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sche
  for (int i = 0; i < graph->n_nodes; i++) {
  if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
  lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
- LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
+ LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, lm_ggml_backend_name(split_backend),
  sched->splits[cur_split].n_inputs);
  for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
+ if (j == 0) {
+ LM_GGML_LOG_DEBUG(": ");
+ }
  LM_GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
  fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
  }
package/cpp/ggml-cpp.h CHANGED
@@ -7,6 +7,7 @@
  #include "ggml.h"
  #include "ggml-alloc.h"
  #include "ggml-backend.h"
+ #include "gguf.h"
  #include <memory>
 
  // Smart pointers for ggml types
package/cpp/ggml-cpu-aarch64.cpp CHANGED
@@ -194,9 +194,12 @@ static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
  }
 
  static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
- #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+ #if defined(__AVX512VNNI__) && defined(__AVX512VL__)
  const __m256i zero = _mm256_setzero_si256();
  return _mm256_dpbusd_epi32(zero, ax, sy);
+ #elif defined(__AVXVNNI__)
+ const __m256i zero = _mm256_setzero_si256();
+ return _mm256_dpbusd_avx_epi32(zero, ax, sy);
  #else
  // Perform multiplication and create 16-bit values
  const __m256i dot = _mm256_maddubs_epi16(ax, sy);
@@ -4166,6 +4169,8 @@ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_aarch64_buffer_type_alloc_bu
  buffer->buft = buft;
  buffer->iface.init_tensor = lm_ggml_backend_cpu_aarch64_buffer_init_tensor;
  buffer->iface.set_tensor = lm_ggml_backend_cpu_aarch64_buffer_set_tensor;
+ buffer->iface.get_tensor = nullptr;
+ buffer->iface.cpy_tensor = nullptr;
  return buffer;
  }
 
package/cpp/ggml-cpu-quants.c CHANGED
@@ -103,10 +103,14 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
  }
 
  static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
- #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+ #if defined(__AVX512VNNI__) && defined(__AVX512VL__)
  const __m256i zero = _mm256_setzero_si256();
  const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
  return _mm256_cvtepi32_ps(summed_pairs);
+ #elif defined(__AVXVNNI__)
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy);
+ return _mm256_cvtepi32_ps(summed_pairs);
  #else
  // Perform multiplication and create 16-bit values
  const __m256i dot = _mm256_maddubs_epi16(ax, sy);
package/cpp/ggml-impl.h CHANGED
@@ -3,6 +3,8 @@
  // GGML internal header
 
  #include "ggml.h"
+ #include "gguf.h"
+
  #include <assert.h>
  #include <math.h>
  #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
@@ -551,22 +553,15 @@ static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
  #define LM_GGML_FP32_TO_BF16(x) lm_ggml_compute_fp32_to_bf16(x)
  #define LM_GGML_BF16_TO_FP32(x) lm_ggml_compute_bf16_to_fp32(x)
 
- // expose GGUF internals for test code
-
- LM_GGML_API size_t lm_gguf_type_size(enum lm_gguf_type type);
-
- LM_GGML_API struct lm_gguf_context * lm_gguf_init_from_file_impl(FILE * file, struct lm_gguf_init_params params);
-
- struct lm_gguf_buf {
- void * data;
- size_t size;
- size_t offset;
- };
- LM_GGML_API struct lm_gguf_buf lm_gguf_buf_init(size_t size);
- LM_GGML_API void lm_gguf_buf_free(struct lm_gguf_buf buf);
-
- LM_GGML_API void lm_gguf_write_to_buf(const struct lm_gguf_context * ctx, struct lm_gguf_buf * buf, bool only_meta);
-
  #ifdef __cplusplus
  }
  #endif
+
+ #ifdef __cplusplus
+ #include <vector>
+
+ // expose GGUF internals for test code
+ LM_GGML_API size_t lm_gguf_type_size(enum lm_gguf_type type);
+ LM_GGML_API struct lm_gguf_context * lm_gguf_init_from_file_impl(FILE * file, struct lm_gguf_init_params params);
+ LM_GGML_API void lm_gguf_write_to_buf(const struct lm_gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
+ #endif // __cplusplus
package/cpp/ggml-metal.m CHANGED
@@ -2067,8 +2067,8 @@ static void lm_ggml_metal_encode_node(
  LM_GGML_ASSERT(ne12 % ne02 == 0);
  LM_GGML_ASSERT(ne13 % ne03 == 0);
 
- const uint r2 = ne12/ne02;
- const uint r3 = ne13/ne03;
+ const uint32_t r2 = ne12/ne02;
+ const uint32_t r3 = ne13/ne03;
 
  // find the break-even point where the matrix-matrix kernel becomes more efficient compared
  // to the matrix-vector kernel