@fugood/llama.node 0.3.6 → 0.3.8

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (186)
  1. package/README.md +17 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +3 -1
  19. package/lib/index.js +16 -1
  20. package/lib/index.ts +16 -0
  21. package/package.json +1 -1
  22. package/src/EmbeddingWorker.cpp +4 -3
  23. package/src/LlamaCompletionWorker.cpp +4 -2
  24. package/src/LlamaContext.cpp +61 -6
  25. package/src/LlamaContext.h +1 -0
  26. package/src/common.hpp +6 -11
  27. package/src/llama.cpp/.github/workflows/build.yml +19 -17
  28. package/src/llama.cpp/.github/workflows/docker.yml +77 -30
  29. package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +22 -3
  31. package/src/llama.cpp/CMakeLists.txt +49 -24
  32. package/src/llama.cpp/common/arg.cpp +82 -26
  33. package/src/llama.cpp/common/arg.h +3 -0
  34. package/src/llama.cpp/common/common.cpp +192 -72
  35. package/src/llama.cpp/common/common.h +51 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +12 -12
  37. package/src/llama.cpp/common/ngram-cache.h +2 -2
  38. package/src/llama.cpp/common/sampling.cpp +11 -6
  39. package/src/llama.cpp/common/speculative.cpp +18 -15
  40. package/src/llama.cpp/docs/build.md +2 -0
  41. package/src/llama.cpp/examples/batched/batched.cpp +9 -7
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
  43. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
  44. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
  45. package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
  46. package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
  47. package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
  48. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
  50. package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
  51. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
  52. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
  53. package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
  54. package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
  55. package/src/llama.cpp/examples/infill/infill.cpp +23 -24
  56. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
  57. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
  58. package/src/llama.cpp/examples/llava/clip.cpp +4 -2
  59. package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
  60. package/src/llama.cpp/examples/llava/llava.cpp +2 -2
  61. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
  62. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
  63. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
  64. package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
  65. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
  66. package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
  67. package/src/llama.cpp/examples/main/main.cpp +51 -29
  68. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
  69. package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
  70. package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
  71. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
  72. package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
  73. package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
  74. package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
  76. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
  77. package/src/llama.cpp/examples/run/run.cpp +175 -61
  78. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
  79. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
  80. package/src/llama.cpp/examples/server/httplib.h +1295 -409
  81. package/src/llama.cpp/examples/server/server.cpp +387 -181
  82. package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
  83. package/src/llama.cpp/examples/server/utils.hpp +170 -58
  84. package/src/llama.cpp/examples/simple/simple.cpp +9 -8
  85. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
  86. package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
  87. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
  88. package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
  89. package/src/llama.cpp/examples/tts/tts.cpp +64 -23
  90. package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
  91. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  92. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
  93. package/src/llama.cpp/ggml/include/ggml.h +36 -145
  94. package/src/llama.cpp/ggml/include/gguf.h +202 -0
  95. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  96. package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
  97. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
  98. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
  99. package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
  100. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
  101. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
  102. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
  103. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
  105. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
  106. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
  107. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  109. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
  111. package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
  112. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
  113. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
  115. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
  117. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
  120. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
  121. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
  124. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
  125. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
  126. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
  128. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
  129. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
  130. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
  131. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
  132. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
  133. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
  134. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
  135. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
  138. package/src/llama.cpp/ggml/src/ggml.c +117 -1327
  139. package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
  140. package/src/llama.cpp/include/llama-cpp.h +6 -1
  141. package/src/llama.cpp/include/llama.h +138 -75
  142. package/src/llama.cpp/src/CMakeLists.txt +13 -1
  143. package/src/llama.cpp/src/llama-adapter.cpp +347 -0
  144. package/src/llama.cpp/src/llama-adapter.h +74 -0
  145. package/src/llama.cpp/src/llama-arch.cpp +1487 -0
  146. package/src/llama.cpp/src/llama-arch.h +400 -0
  147. package/src/llama.cpp/src/llama-batch.cpp +368 -0
  148. package/src/llama.cpp/src/llama-batch.h +88 -0
  149. package/src/llama.cpp/src/llama-chat.cpp +578 -0
  150. package/src/llama.cpp/src/llama-chat.h +52 -0
  151. package/src/llama.cpp/src/llama-context.cpp +1775 -0
  152. package/src/llama.cpp/src/llama-context.h +128 -0
  153. package/src/llama.cpp/src/llama-cparams.cpp +1 -0
  154. package/src/llama.cpp/src/llama-cparams.h +37 -0
  155. package/src/llama.cpp/src/llama-grammar.cpp +5 -4
  156. package/src/llama.cpp/src/llama-grammar.h +3 -1
  157. package/src/llama.cpp/src/llama-hparams.cpp +71 -0
  158. package/src/llama.cpp/src/llama-hparams.h +139 -0
  159. package/src/llama.cpp/src/llama-impl.cpp +167 -0
  160. package/src/llama.cpp/src/llama-impl.h +16 -136
  161. package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
  162. package/src/llama.cpp/src/llama-kv-cache.h +218 -0
  163. package/src/llama.cpp/src/llama-mmap.cpp +589 -0
  164. package/src/llama.cpp/src/llama-mmap.h +67 -0
  165. package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
  166. package/src/llama.cpp/src/llama-model-loader.h +167 -0
  167. package/src/llama.cpp/src/llama-model.cpp +3953 -0
  168. package/src/llama.cpp/src/llama-model.h +370 -0
  169. package/src/llama.cpp/src/llama-quant.cpp +934 -0
  170. package/src/llama.cpp/src/llama-quant.h +1 -0
  171. package/src/llama.cpp/src/llama-sampling.cpp +147 -32
  172. package/src/llama.cpp/src/llama-sampling.h +3 -19
  173. package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
  174. package/src/llama.cpp/src/llama-vocab.h +97 -142
  175. package/src/llama.cpp/src/llama.cpp +7160 -20314
  176. package/src/llama.cpp/src/unicode.cpp +8 -3
  177. package/src/llama.cpp/tests/CMakeLists.txt +2 -0
  178. package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
  179. package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
  180. package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
  181. package/src/llama.cpp/tests/test-gguf.cpp +222 -187
  182. package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
  183. package/src/llama.cpp/tests/test-sampling.cpp +0 -1
  184. package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
  185. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
  186. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6

package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp

@@ -87,7 +87,7 @@ Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring fi
  auto path_to_model = env->GetStringUTFChars(filename, 0);
  LOGi("Loading model from %s", path_to_model);

- auto model = llama_load_model_from_file(path_to_model, model_params);
+ auto model = llama_model_load_from_file(path_to_model, model_params);
  env->ReleaseStringUTFChars(filename, path_to_model);

  if (!model) {
@@ -102,7 +102,7 @@ Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring fi
  extern "C"
  JNIEXPORT void JNICALL
  Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
- llama_free_model(reinterpret_cast<llama_model *>(model));
+ llama_model_free(reinterpret_cast<llama_model *>(model));
  }

  extern "C"
@@ -305,7 +305,9 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
  extern "C"
  JNIEXPORT void JNICALL
  Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
- llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
+ //llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
+ const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
+ delete batch;
  }

  extern "C"
@@ -345,6 +347,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
  jlong context_pointer,
  jlong batch_pointer,
  jstring jtext,
+ jboolean format_chat,
  jint n_len
  ) {

@@ -354,7 +357,8 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
  const auto context = reinterpret_cast<llama_context *>(context_pointer);
  const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);

- const auto tokens_list = common_tokenize(context, text, 1);
+ bool parse_special = (format_chat == JNI_TRUE);
+ const auto tokens_list = common_tokenize(context, text, true, parse_special);

  auto n_ctx = llama_n_ctx(context);
  auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
@@ -366,7 +370,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
  }

  for (auto id : tokens_list) {
- LOGi("%s", common_token_to_piece(context, id).c_str());
+ LOGi("token: `%s`-> %d ", common_token_to_piece(context, id).c_str(), id);
  }

  common_batch_clear(*batch);
@@ -403,6 +407,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
  const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
  const auto sampler = reinterpret_cast<llama_sampler *>(sampler_pointer);
  const auto model = llama_get_model(context);
+ const auto vocab = llama_model_get_vocab(model);

  if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
  if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
@@ -412,7 +417,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
  const auto new_token_id = llama_sampler_sample(sampler, context, -1);

  const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
- if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
+ if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len) {
  return nullptr;
  }


package/src/llama.cpp/examples/llava/clip.cpp

@@ -7,6 +7,7 @@
  #include "ggml-cpu.h"
  #include "ggml-alloc.h"
  #include "ggml-backend.h"
+ #include "gguf.h"

  //#ifdef GGML_USE_CUDA
  //#include "ggml-cuda.h"
@@ -262,7 +263,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
  {
  const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
  int arr_n = gguf_get_arr_n(ctx_gguf, i);
- const void * data = gguf_get_arr_data(ctx_gguf, i);
+ const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
  std::stringstream ss;
  ss << "[";
  for (int j = 0; j < arr_n; j++) {
@@ -2734,7 +2735,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
  total_size_org += orig_size;
  total_size_new += new_size;
  gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
- gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
+ GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size);
+ gguf_set_tensor_data(ctx_out, name.c_str(), new_data);
  fout.write((const char *)new_data, new_size);
  size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
  for (size_t j = 0; j < pad; ++j) {

package/src/llama.cpp/examples/llava/llava-cli.cpp

@@ -47,8 +47,12 @@ static const char * sample(struct common_sampler * smpl,
  int * n_past) {
  const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
  common_sampler_accept(smpl, id, true);
+
+ const llama_model * model = llama_get_model(ctx_llama);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
  static std::string ret;
- if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
+ if (llama_vocab_is_eog(vocab, id)) {
  ret = "</s>";
  } else {
  ret = common_token_to_piece(ctx_llama, id);
@@ -221,7 +225,7 @@ static struct llama_model * llava_init(common_params * params) {

  llama_model_params model_params = common_model_params_to_llama(*params);

- llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+ llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
  if (model == NULL) {
  LOG_ERR("%s: unable to load model\n" , __func__);
  return NULL;
@@ -239,11 +243,10 @@ static struct llava_context * llava_init_context(common_params * params, llama_m

  auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);

-
  llama_context_params ctx_params = common_context_params_to_llama(*params);
  ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings

- llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
+ llama_context * ctx_llama = llama_init_from_model(model, ctx_params);

  if (ctx_llama == NULL) {
  LOG_ERR("%s: failed to create the llama_context\n" , __func__);
@@ -265,7 +268,7 @@ static void llava_free(struct llava_context * ctx_llava) {
  }

  llama_free(ctx_llava->ctx_llama);
- llama_free_model(ctx_llava->model);
+ llama_model_free(ctx_llava->model);
  llama_backend_free();
  }

@@ -323,7 +326,7 @@ int main(int argc, char ** argv) {
  }
  }

- llama_free_model(model);
+ llama_model_free(model);

  return 0;
  }

package/src/llama.cpp/examples/llava/llava.cpp

@@ -384,7 +384,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli

  bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
  // make sure that the correct mmproj was used, i.e., compare apples to apples
- int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
+ int n_llama_embd = llama_model_n_embd(llama_get_model(ctx_llama));
  auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
  if (n_image_embd != n_llama_embd) {
  LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
@@ -456,7 +456,7 @@ struct llava_embd_batch {
  };

  bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
- int n_embd = llama_n_embd(llama_get_model(ctx_llama));
+ int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));

  for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
  int n_eval = image_embed->n_image_pos - i;

package/src/llama.cpp/examples/llava/minicpmv-cli.cpp

@@ -31,7 +31,7 @@ static struct llama_model * llava_init(common_params * params) {

  llama_model_params model_params = common_model_params_to_llama(*params);

- llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+ llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
  if (model == NULL) {
  LOG_ERR("%s: unable to load model\n" , __func__);
  return NULL;
@@ -54,7 +54,7 @@ static struct llava_context * llava_init_context(common_params * params, llama_m
  ctx_params.n_ctx = params->n_ctx;
  }

- llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
+ llama_context * ctx_llama = llama_init_from_model(model, ctx_params);

  if (ctx_llama == NULL) {
  LOG_ERR("%s: failed to create the llama_context\n" , __func__);
@@ -75,7 +75,7 @@ static void llava_free(struct llava_context * ctx_llava) {
  }

  llama_free(ctx_llava->ctx_llama);
- llama_free_model(ctx_llava->model);
+ llama_model_free(ctx_llava->model);
  llama_backend_free();
  }

@@ -167,8 +167,12 @@ static const char * sample(struct common_sampler * smpl,
  int * n_past) {
  const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
  common_sampler_accept(smpl, id, true);
+
+ const llama_model * model = llama_get_model(ctx_llama);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
  static std::string ret;
- if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
+ if (llama_vocab_is_eog(vocab, id)) {
  ret = "</s>";
  } else {
  ret = common_token_to_piece(ctx_llama, id);

package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp

@@ -27,7 +27,7 @@

  static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
  int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
- int n_embd = llama_n_embd(llama_get_model(ctx_llama));
+ int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));
  const int patch_size = 14 * 2;
  const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0);
  const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0);
@@ -132,8 +132,12 @@ static const char * sample(struct common_sampler * smpl,
  int * n_past, int * st_pos_id) {
  const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
  common_sampler_accept(smpl, id, true);
+
+ const llama_model * model = llama_get_model(ctx_llama);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
  static std::string ret;
- if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
+ if (llama_vocab_is_eog(vocab, id)) {
  ret = "</s>";
  } else {
  ret = common_token_to_piece(ctx_llama, id);
@@ -310,7 +314,7 @@ static struct llama_model * llava_init(common_params * params) {

  llama_model_params model_params = common_model_params_to_llama(*params);

- llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+ llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
  if (model == NULL) {
  LOG_ERR("%s: unable to load model\n" , __func__);
  return NULL;
@@ -328,11 +332,10 @@ static struct llava_context * llava_init_context(common_params * params, llama_m

  auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);

-
  llama_context_params ctx_params = common_context_params_to_llama(*params);
  ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings

- llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
+ llama_context * ctx_llama = llama_init_from_model(model, ctx_params);

  if (ctx_llama == NULL) {
  LOG_ERR("%s: failed to create the llama_context\n" , __func__);
@@ -354,7 +357,7 @@ static void llava_free(struct llava_context * ctx_llava) {
  }

  llama_free(ctx_llava->ctx_llama);
- llama_free_model(ctx_llava->model);
+ llama_model_free(ctx_llava->model);
  llama_backend_free();
  }

@@ -481,7 +484,7 @@ static void debug_test_mrope_2d() {
  }

  static void debug_dump_img_embed(struct llava_context * ctx_llava) {
- int n_embd = llama_n_embd(llama_get_model(ctx_llava->ctx_llama));
+ int n_embd = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama));
  int ne = n_embd * 4;
  float vals[56 * 56 * 3];
  // float embd[ne];
@@ -575,7 +578,7 @@ int main(int argc, char ** argv) {
  }
  }

- llama_free_model(model);
+ llama_model_free(model);

  return 0;
  }

package/src/llama.cpp/examples/lookahead/lookahead.cpp

@@ -58,8 +58,10 @@ int main(int argc, char ** argv) {
  // load the target model
  common_init_result llama_init = common_init_from_params(params);

- llama_model * model = llama_init.model;
- llama_context * ctx = llama_init.context;
+ llama_model * model = llama_init.model.get();
+ llama_context * ctx = llama_init.context.get();
+
+ const llama_vocab * vocab = llama_model_get_vocab(model);

  // Tokenize the prompt
  std::vector<llama_token> inp;
@@ -147,7 +149,7 @@ int main(int argc, char ** argv) {
  }

  // here we keep adding new n-grams as we go
- ngram_container ngrams_observed(llama_n_vocab(model), N, G);
+ ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G);

  // debug
  struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
@@ -297,7 +299,7 @@ int main(int argc, char ** argv) {
  }
  fflush(stdout);

- if (llama_token_is_eog(model, id)) {
+ if (llama_vocab_is_eog(vocab, id)) {
  has_eos = true;
  }

@@ -474,9 +476,6 @@ int main(int argc, char ** argv) {

  llama_batch_free(batch);

- llama_free(ctx);
- llama_free_model(model);
-
  llama_backend_free();

  LOG("\n\n");

package/src/llama.cpp/examples/lookup/lookup-create.cpp

@@ -1,14 +1,9 @@
  #include "arg.h"
  #include "common.h"
  #include "ngram-cache.h"
- #include "ggml.h"
  #include "llama.h"

- #include <cstdint>
- #include <fstream>
- #include <iostream>
  #include <string>
- #include <unordered_map>
  #include <vector>

  int main(int argc, char ** argv){
@@ -25,16 +20,16 @@ int main(int argc, char ** argv){
  // load the model
  common_init_result llama_init = common_init_from_params(params);

- llama_model * model = llama_init.model;
- llama_context * ctx = llama_init.context;
+ llama_model_ptr & model = llama_init.model;
+ llama_context_ptr & ctx = llama_init.context;
+
  GGML_ASSERT(model != nullptr);

  // tokenize the prompt
  std::vector<llama_token> inp;
- inp = common_tokenize(ctx, params.prompt, true, true);
+ inp = common_tokenize(ctx.get(), params.prompt, true, true);
  fprintf(stderr, "%s: tokenization done\n", __func__);

-
  common_ngram_cache ngram_cache;
  common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
  fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());

package/src/llama.cpp/examples/lookup/lookup-stats.cpp

@@ -30,12 +30,11 @@ int main(int argc, char ** argv){
  // load the model
  common_init_result llama_init = common_init_from_params(params);

- llama_model * model = llama_init.model;
- llama_context * ctx = llama_init.context;
+ llama_context_ptr & ctx = llama_init.context;

  // tokenize the prompt
  std::vector<llama_token> inp;
- inp = common_tokenize(ctx, params.prompt, true, true);
+ inp = common_tokenize(ctx.get(), params.prompt, true, true);

  common_ngram_cache ngram_cache_context;
  common_ngram_cache ngram_cache_dynamic;
@@ -66,7 +65,7 @@ int main(int argc, char ** argv){
  }

  const int n_input = inp.size();
- const int n_ctx = llama_n_ctx(ctx);
+ const int n_ctx = llama_n_ctx(ctx.get());

  int n_drafted = 0;
  int n_accept = 0;
@@ -150,9 +149,6 @@ int main(int argc, char ** argv){
  LOG_INF("n_accept = %d\n", n_accept);
  LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);

- llama_free(ctx);
- llama_free_model(model);
-
  llama_backend_free();

  LOG("\n\n");

package/src/llama.cpp/examples/lookup/lookup.cpp

@@ -33,8 +33,10 @@ int main(int argc, char ** argv){
  // load the model
  common_init_result llama_init = common_init_from_params(params);

- llama_model * model = llama_init.model;
- llama_context * ctx = llama_init.context;
+ llama_model * model = llama_init.model.get();
+ llama_context * ctx = llama_init.context.get();
+
+ const llama_vocab * vocab = llama_model_get_vocab(model);

  // tokenize the prompt
  std::vector<llama_token> inp;
@@ -136,7 +138,7 @@ int main(int argc, char ** argv){
  LOG("%s", token_str.c_str());
  }

- if (llama_token_is_eog(model, id)) {
+ if (llama_vocab_is_eog(vocab, id)) {
  has_eos = true;
  }

@@ -243,9 +245,6 @@ int main(int argc, char ** argv){

  llama_batch_free(batch_tgt);

- llama_free(ctx);
- llama_free_model(model);
-
  llama_backend_free();

  LOG("\n\n");

package/src/llama.cpp/examples/main/main.cpp

@@ -5,7 +5,6 @@
  #include "sampling.h"
  #include "llama.h"

- #include <cassert>
  #include <cstdio>
  #include <cstring>
  #include <ctime>
@@ -31,6 +30,8 @@
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif

+ static const char * DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant";
+
  static llama_context ** g_ctx;
  static llama_model ** g_model;
  static common_sampler ** g_smpl;
@@ -145,24 +146,26 @@ int main(int argc, char ** argv) {
  llama_context * ctx = nullptr;
  common_sampler * smpl = nullptr;

- std::vector<common_chat_msg> chat_msgs;
-
  g_model = &model;
  g_ctx = &ctx;
  g_smpl = &smpl;

+ std::vector<common_chat_msg> chat_msgs;
+
  // load the model and apply lora adapter, if any
  LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
  common_init_result llama_init = common_init_from_params(params);

- model = llama_init.model;
- ctx = llama_init.context;
+ model = llama_init.model.get();
+ ctx = llama_init.context.get();

  if (model == NULL) {
  LOG_ERR("%s: error: unable to load model\n", __func__);
  return 1;
  }

+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
  LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);

  auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
@@ -196,15 +199,31 @@ int main(int argc, char ** argv) {

  llama_attach_threadpool(ctx, threadpool, threadpool_batch);

- const int n_ctx_train = llama_n_ctx_train(model);
+ const int n_ctx_train = llama_model_n_ctx_train(model);
  const int n_ctx = llama_n_ctx(ctx);

  if (n_ctx > n_ctx_train) {
  LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
  }

+ // auto enable conversation mode if chat template is available
+ const bool has_chat_template = !common_get_builtin_chat_template(model).empty() || !params.chat_template.empty();
+ if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) {
+ if (has_chat_template) {
+ LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
+ params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
+ } else {
+ params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
+ }
+ }
+
+ // in case user force-activate conversation mode (via -cnv) without proper chat template, we show a warning
+ if (params.conversation_mode && !has_chat_template) {
+ LOG_WRN("%s: chat template is not available or is not supported. This may cause the model to output suboptimal responses\n", __func__);
+ }
+
  // print chat template example in conversation mode
- if (params.conversation) {
+ if (params.conversation_mode) {
  if (params.enable_chat_template) {
  LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str());
  } else {
@@ -241,9 +260,9 @@ int main(int argc, char ** argv) {
  }
  }

- const bool add_bos = llama_add_bos_token(model);
+ const bool add_bos = llama_vocab_get_add_bos(vocab);
  if (!llama_model_has_encoder(model)) {
- GGML_ASSERT(!llama_add_eos_token(model));
+ GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
  }

  LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
@@ -251,8 +270,10 @@ int main(int argc, char ** argv) {
  std::vector<llama_token> embd_inp;

  {
- auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty())
- ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
+ auto prompt = (params.conversation_mode && params.enable_chat_template)
+ // format the system prompt in conversation mode (fallback to default if empty)
+ ? chat_add_and_format(model, chat_msgs, "system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
+ // otherwise use the prompt as is
  : params.prompt;
  if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
  LOG_DBG("tokenize the prompt\n");
@@ -269,7 +290,7 @@ int main(int argc, char ** argv) {
  // Should not run without any tokens
  if (embd_inp.empty()) {
  if (add_bos) {
- embd_inp.push_back(llama_token_bos(model));
+ embd_inp.push_back(llama_vocab_bos(vocab));
  LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
  } else {
  LOG_ERR("input is empty\n");
@@ -326,7 +347,7 @@ int main(int argc, char ** argv) {
  params.n_keep += add_bos; // always keep the BOS token
  }

- if (params.conversation) {
+ if (params.conversation_mode) {
  params.interactive_first = true;
  }

@@ -450,7 +471,11 @@ int main(int argc, char ** argv) {
  #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
  LOG_INF( " - Press Ctrl+C to interject at any time.\n");
  #endif
- LOG_INF( "%s\n", control_message);
+ LOG_INF( "%s", control_message);
+ if (params.conversation_mode && params.enable_chat_template && params.prompt.empty()) {
+ LOG_INF( " - Using default system message. To change it, set a different value via -p PROMPT or -f FILE argument.\n");
+ }
+ LOG_INF("\n");

  is_interacting = params.interactive_first;
  }
@@ -494,8 +519,8 @@ int main(int argc, char ** argv) {
  }

  llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
- if (decoder_start_token_id == -1) {
- decoder_start_token_id = llama_token_bos(model);
+ if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
+ decoder_start_token_id = llama_vocab_bos(vocab);
  }

  embd_inp.clear();
@@ -742,7 +767,7 @@ int main(int argc, char ** argv) {
  }

  // deal with end of generation tokens in interactive mode
- if (llama_token_is_eog(model, common_sampler_last(smpl))) {
+ if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
  LOG_DBG("found an EOG token\n");

  if (params.interactive) {
@@ -762,7 +787,7 @@ int main(int argc, char ** argv) {
  }

  // if current token is not EOG, we add it to current assistant message
- if (params.conversation) {
+ if (params.conversation_mode) {
  const auto id = common_sampler_last(smpl);
  assistant_ss << common_token_to_piece(ctx, id, false);
  }
@@ -770,17 +795,17 @@ int main(int argc, char ** argv) {
  if (n_past > 0 && is_interacting) {
  LOG_DBG("waiting for user input\n");

- if (params.conversation) {
+ if (params.conversation_mode) {
  LOG("\n> ");
  }

  if (params.input_prefix_bos) {
  LOG_DBG("adding input prefix BOS token\n");
- embd_inp.push_back(llama_token_bos(model));
+ embd_inp.push_back(llama_vocab_bos(vocab));
  }

  std::string buffer;
- if (!params.input_prefix.empty() && !params.conversation) {
+ if (!params.input_prefix.empty() && !params.conversation_mode) {
  LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
  LOG("%s", params.input_prefix.c_str());
  }
@@ -804,7 +829,7 @@ int main(int argc, char ** argv) {
  // Entering a empty line lets the user pass control back
  if (buffer.length() > 1) {
  // append input suffix if any
- if (!params.input_suffix.empty() && !params.conversation) {
+ if (!params.input_suffix.empty() && !params.conversation_mode) {
  LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
  LOG("%s", params.input_suffix.c_str());
  }
@@ -817,7 +842,7 @@ int main(int argc, char ** argv) {
  string_process_escapes(buffer);
  }

- bool format_chat = params.conversation && params.enable_chat_template;
+ bool format_chat = params.conversation_mode && params.enable_chat_template;
  std::string user_inp = format_chat
  ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
  : std::move(buffer);
@@ -830,8 +855,8 @@ int main(int argc, char ** argv) {

  // if user stop generation mid-way, we must add EOT to finish model's last response
  if (need_insert_eot && format_chat) {
- llama_token eot = llama_token_eot(model);
- embd_inp.push_back(eot == -1 ? llama_token_eos(model) : eot);
+ llama_token eot = llama_vocab_eot(vocab);
+ embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_vocab_eos(vocab) : eot);
  need_insert_eot = false;
  }

@@ -866,7 +891,7 @@ int main(int argc, char ** argv) {
  }

  // end of generation
- if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
+ if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !(params.interactive)) {
  LOG(" [end of text]\n");
  break;
  }
@@ -889,9 +914,6 @@ int main(int argc, char ** argv) {

  common_sampler_free(smpl);

- llama_free(ctx);
- llama_free_model(model);
-
  llama_backend_free();

  ggml_threadpool_free_fn(threadpool);
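
Note: the C++ hunks above all reflect the same upstream llama.cpp API migration picked up between 0.3.6 and 0.3.8: llama_load_model_from_file/llama_free_model become llama_model_load_from_file/llama_model_free, llama_new_context_with_model becomes llama_init_from_model, and token queries (BOS/EOS/EOT, EOG checks) move from the model to a llama_vocab handle obtained via llama_model_get_vocab. As a rough sketch only, not part of this diff, using only calls that appear in the hunks above plus the default-params helpers from llama.h, the updated call pattern looks approximately like:

    // sketch: new-style model/context/vocab lifecycle (assumes a valid GGUF model path)
    #include "llama.h"

    int demo(const char * model_path) {
        llama_backend_init();

        llama_model * model = llama_model_load_from_file(model_path, llama_model_default_params());
        if (model == NULL) {
            return 1;
        }

        const llama_vocab * vocab = llama_model_get_vocab(model);   // vocab-level queries live here now
        llama_context * ctx = llama_init_from_model(model, llama_context_default_params());

        const llama_token bos = llama_vocab_bos(vocab);             // formerly llama_token_bos(model)
        const bool eog = llama_vocab_is_eog(vocab, bos);            // formerly llama_token_is_eog(model, ...)
        (void) eog;

        llama_free(ctx);
        llama_model_free(model);                                    // formerly llama_free_model(model)
        llama_backend_free();
        return 0;
    }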