@fugood/llama.node 0.3.7 → 0.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/README.md +17 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +8 -0
  19. package/lib/index.js +16 -1
  20. package/lib/index.ts +16 -0
  21. package/package.json +1 -1
  22. package/src/EmbeddingWorker.cpp +4 -3
  23. package/src/LlamaCompletionWorker.cpp +4 -2
  24. package/src/LlamaContext.cpp +156 -6
  25. package/src/LlamaContext.h +5 -0
  26. package/src/common.hpp +6 -11
  27. package/src/llama.cpp/.github/workflows/build.yml +19 -17
  28. package/src/llama.cpp/.github/workflows/docker.yml +77 -30
  29. package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +22 -3
  31. package/src/llama.cpp/CMakeLists.txt +49 -24
  32. package/src/llama.cpp/common/arg.cpp +82 -26
  33. package/src/llama.cpp/common/arg.h +3 -0
  34. package/src/llama.cpp/common/common.cpp +192 -72
  35. package/src/llama.cpp/common/common.h +51 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +12 -12
  37. package/src/llama.cpp/common/ngram-cache.h +2 -2
  38. package/src/llama.cpp/common/sampling.cpp +11 -6
  39. package/src/llama.cpp/common/speculative.cpp +18 -15
  40. package/src/llama.cpp/docs/build.md +2 -0
  41. package/src/llama.cpp/examples/batched/batched.cpp +9 -7
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
  43. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
  44. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
  45. package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
  46. package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
  47. package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
  48. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
  50. package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
  51. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
  52. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
  53. package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
  54. package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
  55. package/src/llama.cpp/examples/infill/infill.cpp +23 -24
  56. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
  57. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
  58. package/src/llama.cpp/examples/llava/clip.cpp +4 -2
  59. package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
  60. package/src/llama.cpp/examples/llava/llava.cpp +2 -2
  61. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
  62. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
  63. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
  64. package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
  65. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
  66. package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
  67. package/src/llama.cpp/examples/main/main.cpp +51 -29
  68. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
  69. package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
  70. package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
  71. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
  72. package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
  73. package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
  74. package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
  76. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
  77. package/src/llama.cpp/examples/run/run.cpp +175 -61
  78. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
  79. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
  80. package/src/llama.cpp/examples/server/httplib.h +1295 -409
  81. package/src/llama.cpp/examples/server/server.cpp +387 -181
  82. package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
  83. package/src/llama.cpp/examples/server/utils.hpp +170 -58
  84. package/src/llama.cpp/examples/simple/simple.cpp +9 -8
  85. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
  86. package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
  87. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
  88. package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
  89. package/src/llama.cpp/examples/tts/tts.cpp +64 -23
  90. package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
  91. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  92. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
  93. package/src/llama.cpp/ggml/include/ggml.h +36 -145
  94. package/src/llama.cpp/ggml/include/gguf.h +202 -0
  95. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  96. package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
  97. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
  98. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
  99. package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
  100. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
  101. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
  102. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
  103. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
  105. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
  106. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
  107. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  109. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
  111. package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
  112. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
  113. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
  115. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
  117. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
  120. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
  121. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
  124. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
  125. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
  126. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
  128. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
  129. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
  130. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
  131. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
  132. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
  133. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
  134. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
  135. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
  138. package/src/llama.cpp/ggml/src/ggml.c +117 -1327
  139. package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
  140. package/src/llama.cpp/include/llama-cpp.h +6 -1
  141. package/src/llama.cpp/include/llama.h +138 -75
  142. package/src/llama.cpp/src/CMakeLists.txt +13 -1
  143. package/src/llama.cpp/src/llama-adapter.cpp +347 -0
  144. package/src/llama.cpp/src/llama-adapter.h +74 -0
  145. package/src/llama.cpp/src/llama-arch.cpp +1487 -0
  146. package/src/llama.cpp/src/llama-arch.h +400 -0
  147. package/src/llama.cpp/src/llama-batch.cpp +368 -0
  148. package/src/llama.cpp/src/llama-batch.h +88 -0
  149. package/src/llama.cpp/src/llama-chat.cpp +578 -0
  150. package/src/llama.cpp/src/llama-chat.h +52 -0
  151. package/src/llama.cpp/src/llama-context.cpp +1775 -0
  152. package/src/llama.cpp/src/llama-context.h +128 -0
  153. package/src/llama.cpp/src/llama-cparams.cpp +1 -0
  154. package/src/llama.cpp/src/llama-cparams.h +37 -0
  155. package/src/llama.cpp/src/llama-grammar.cpp +5 -4
  156. package/src/llama.cpp/src/llama-grammar.h +3 -1
  157. package/src/llama.cpp/src/llama-hparams.cpp +71 -0
  158. package/src/llama.cpp/src/llama-hparams.h +139 -0
  159. package/src/llama.cpp/src/llama-impl.cpp +167 -0
  160. package/src/llama.cpp/src/llama-impl.h +16 -136
  161. package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
  162. package/src/llama.cpp/src/llama-kv-cache.h +218 -0
  163. package/src/llama.cpp/src/llama-mmap.cpp +589 -0
  164. package/src/llama.cpp/src/llama-mmap.h +67 -0
  165. package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
  166. package/src/llama.cpp/src/llama-model-loader.h +167 -0
  167. package/src/llama.cpp/src/llama-model.cpp +3953 -0
  168. package/src/llama.cpp/src/llama-model.h +370 -0
  169. package/src/llama.cpp/src/llama-quant.cpp +934 -0
  170. package/src/llama.cpp/src/llama-quant.h +1 -0
  171. package/src/llama.cpp/src/llama-sampling.cpp +147 -32
  172. package/src/llama.cpp/src/llama-sampling.h +3 -19
  173. package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
  174. package/src/llama.cpp/src/llama-vocab.h +97 -142
  175. package/src/llama.cpp/src/llama.cpp +7160 -20314
  176. package/src/llama.cpp/src/unicode.cpp +8 -3
  177. package/src/llama.cpp/tests/CMakeLists.txt +2 -0
  178. package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
  179. package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
  180. package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
  181. package/src/llama.cpp/tests/test-gguf.cpp +222 -187
  182. package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
  183. package/src/llama.cpp/tests/test-sampling.cpp +0 -1
  184. package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
  185. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
  186. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6

package/src/llama.cpp/examples/parallel/parallel.cpp

@@ -132,8 +132,10 @@ int main(int argc, char ** argv) {
  // load the target model
  common_init_result llama_init = common_init_from_params(params);
 
- llama_model * model = llama_init.model;
- llama_context * ctx = llama_init.context;
+ llama_model * model = llama_init.model.get();
+ llama_context * ctx = llama_init.context.get();
+
+ const llama_vocab * vocab = llama_model_get_vocab(model);
 
  // load the prompts from an external file if there are any
  if (params.prompt.empty()) {
@@ -358,7 +360,7 @@ int main(int argc, char ** argv) {
  // client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
 
  if (client.n_decoded > 2 &&
- (llama_token_is_eog(model, id) ||
+ (llama_vocab_is_eog(vocab, id) ||
  (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
  client.response.find("User:") != std::string::npos ||
  client.response.find('\n') != std::string::npos)) {
@@ -416,9 +418,6 @@ int main(int argc, char ** argv) {
 
  llama_batch_free(batch);
 
- llama_free(ctx);
- llama_free_model(model);
-
  llama_backend_free();
 
  LOG("\n\n");
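
Note: the parallel.cpp hunks above show the pattern repeated across the bundled llama.cpp update: common_init_from_params now appears to hand back owning handles (hence the .get() calls and the removed explicit llama_free/llama_free_model), and end-of-generation checks move from the model to the vocabulary returned by llama_model_get_vocab. A minimal sketch of that calling pattern follows; the helper function is illustrative only, and the smart-pointer ownership of common_init_result is an assumption inferred from the diff, not confirmed by it.

    // Illustrative sketch only; assumes common_init_result owns model/context via
    // smart pointers, as the .get() calls and dropped free calls above suggest.
    #include "common.h"
    #include "llama.h"

    static int generate_once(common_params & params, llama_token sampled_id) {
        common_init_result llama_init = common_init_from_params(params);

        llama_model   * model = llama_init.model.get();   // borrowed, not owned
        llama_context * ctx   = llama_init.context.get(); // released when llama_init is destroyed

        if (model == NULL || ctx == NULL) {
            return 1;
        }

        const llama_vocab * vocab = llama_model_get_vocab(model);

        // end-of-generation is now queried on the vocab, not the model
        if (llama_vocab_is_eog(vocab, sampled_id)) {
            // stop this client's generation
        }

        return 0;
    }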

package/src/llama.cpp/examples/passkey/passkey.cpp

@@ -63,22 +63,24 @@ int main(int argc, char ** argv) {
 
  llama_model_params model_params = common_model_params_to_llama(params);
 
- llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+ llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
 
  if (model == NULL) {
  LOG_ERR("%s: unable to load model\n" , __func__);
  return 1;
  }
 
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
  // initialize the context
 
  llama_context_params ctx_params = common_context_params_to_llama(params);
 
- ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
+ ctx_params.n_ctx = llama_model_n_ctx_train(model)*n_grp + n_keep;
 
  GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
 
- llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+ llama_context * ctx = llama_init_from_model(model, ctx_params);
  if (ctx == NULL) {
  LOG_ERR("%s: failed to create the llama_context\n" , __func__);
  return 1;
@@ -223,7 +225,7 @@ int main(int argc, char ** argv) {
  const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
 
  // is it an end of generation?
- if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
+ if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len) {
  LOG("\n");
 
  break;
@@ -266,7 +268,7 @@ int main(int argc, char ** argv) {
  llama_batch_free(batch);
 
  llama_free(ctx);
- llama_free_model(model);
+ llama_model_free(model);
 
  llama_backend_free();
 
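Note: the passkey.cpp hunks are a straight rename of the model/context lifecycle API. A minimal sketch of the new names next to the old ones; the function below is illustrative, but every llama_* call in it is taken from the diff above.

    // Illustrative sketch of the renamed lifecycle calls shown in the diff above.
    #include "llama.h"

    static int open_and_close(const char * model_path) {
        llama_model_params mparams = llama_model_default_params();

        // was: llama_load_model_from_file(...)
        llama_model * model = llama_model_load_from_file(model_path, mparams);
        if (model == NULL) {
            return 1;
        }

        llama_context_params cparams = llama_context_default_params();
        // was: llama_n_ctx_train(model)
        cparams.n_ctx = llama_model_n_ctx_train(model);

        // was: llama_new_context_with_model(...)
        llama_context * ctx = llama_init_from_model(model, cparams);
        if (ctx == NULL) {
            llama_model_free(model); // was: llama_free_model(model)
            return 1;
        }

        llama_free(ctx);
        llama_model_free(model);
        return 0;
    }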

package/src/llama.cpp/examples/perplexity/perplexity.cpp

@@ -296,8 +296,11 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
  // Output: `perplexity: 13.5106 [114/114]`
  // BOS tokens will be added for each chunk before eval
 
- const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
- GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
+ const bool add_bos = llama_vocab_get_add_bos(vocab);
+ GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
 
  LOG_INF("%s: tokenizing the input ..\n", __func__);
 
@@ -338,7 +341,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
  const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
  const int n_batch = params.n_batch;
 
- const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+ const int n_vocab = llama_vocab_n_tokens(vocab);
 
  int count = 0;
  double nll = 0.0;
@@ -382,7 +385,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
 
  // add BOS token for the first batch of each chunk
  if (add_bos && j == 0) {
- tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
+ tokens[batch_start] = llama_vocab_bos(vocab);
  }
 
  const auto * batch_logits = llama_get_logits(ctx);
@@ -444,8 +447,11 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
  // Output: `perplexity: 13.5106 [114/114]`
  // BOS tokens will be added for each chunk before eval
 
- const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
- GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
+ const bool add_bos = llama_vocab_get_add_bos(vocab);
+ GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
 
  std::ofstream logits_stream;
  if (!params.logits_file.empty()) {
@@ -485,7 +491,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
  const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
  const int n_batch = params.n_batch;
 
- const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+ const int n_vocab = llama_vocab_n_tokens(vocab);
 
  int count = 0;
  double nll = 0.0;
@@ -557,7 +563,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
 
  // add BOS token for the first batch of each chunk
  if (add_bos && j == 0) {
- tokens[seq_start] = llama_token_bos(llama_get_model(ctx));
+ tokens[seq_start] = llama_vocab_bos(vocab);
  }
 
  for (int k = 0; k < batch_size; ++k) {
@@ -732,6 +738,9 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
  }
 
  static void hellaswag_score(llama_context * ctx, const common_params & params) {
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
  // Calculates hellaswag score (acc_norm) from prompt
  //
  // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
@@ -765,7 +774,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
  size_t hs_task_count = prompt_lines.size()/6;
  LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
 
- const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
+ const bool is_spm = llama_vocab_type(vocab) == LLAMA_VOCAB_TYPE_SPM;
  LOG_INF("================================= is_spm = %d\n", is_spm);
 
  // The tasks should be randomized so the score stabilizes quickly.
@@ -848,7 +857,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
  const int n_ctx = llama_n_ctx(ctx);
  const int n_batch = params.n_batch;
 
- const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+ const int n_vocab = llama_vocab_n_tokens(vocab);
 
  const int max_tasks_per_batch = 32;
  const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
@@ -1072,6 +1081,8 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
  *
  */
  static void winogrande_score(llama_context * ctx, const common_params & params) {
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
 
  constexpr int k_min_trailing_ctx = 3;
 
@@ -1130,7 +1141,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params)
  const int n_ctx = llama_n_ctx(ctx);
  const int n_batch = params.n_batch;
 
- const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+ const int n_vocab = llama_vocab_n_tokens(vocab);
 
  const int max_tasks_per_batch = 128;
  const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
@@ -1374,6 +1385,8 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
  // https://huggingface.co/datasets/truthful_qa
  //
  static void multiple_choice_score(llama_context * ctx, const common_params & params) {
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
 
  std::istringstream strstream(params.prompt);
  uint32_t n_task;
@@ -1482,7 +1495,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
  const int n_ctx = llama_n_ctx(ctx);
  const int n_batch = params.n_batch;
 
- const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+ const int n_vocab = llama_vocab_n_tokens(vocab);
 
  const int max_tasks_per_batch = 32;
  const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
@@ -1655,6 +1668,9 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
  }
 
  static void kl_divergence(llama_context * ctx, const common_params & params) {
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
  if (params.logits_file.empty()) {
  LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
  return;
@@ -1688,8 +1704,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
  LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
  return;
  }
- if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
- LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
+ if (n_vocab != llama_vocab_n_tokens(vocab)) {
+ LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_vocab_n_tokens(vocab));
  }
 
  std::vector<llama_token> tokens(size_t(n_ctx) * n_chunk);
@@ -1701,8 +1717,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
  const int n_batch = params.n_batch;
  const int num_batches = (n_ctx + n_batch - 1)/n_batch;
  const int nv = 2*((n_vocab + 1)/2) + 4;
- const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
- GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
+ const bool add_bos = llama_vocab_get_add_bos(vocab);
+ GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
 
  std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
  std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
@@ -1761,7 +1777,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
 
  // add BOS token for the first batch of each chunk
  if (add_bos && j == 0) {
- tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
+ tokens[batch_start] = llama_vocab_bos(vocab);
  }
 
  common_batch_clear(batch);
@@ -1987,14 +2003,15 @@ int main(int argc, char ** argv) {
  // load the model and apply lora adapter, if any
  common_init_result llama_init = common_init_from_params(params);
 
- llama_model * model = llama_init.model;
- llama_context * ctx = llama_init.context;
+ llama_model * model = llama_init.model.get();
+ llama_context * ctx = llama_init.context.get();
+
  if (model == NULL) {
  LOG_ERR("%s: unable to load model\n", __func__);
  return 1;
  }
 
- const int n_ctx_train = llama_n_ctx_train(model);
+ const int n_ctx_train = llama_model_n_ctx_train(model);
 
  if (params.n_ctx > n_ctx_train) {
  LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
@@ -2023,9 +2040,6 @@ int main(int argc, char ** argv) {
  LOG("\n");
  llama_perf_context_print(ctx);
 
- llama_free(ctx);
- llama_free_model(model);
-
  llama_backend_free();
 
  return 0;
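
Note: across perplexity.cpp the change is uniform: vocabulary metadata that used to be read off the model (llama_n_vocab, llama_add_bos_token, llama_token_bos, ...) is now read off a llama_vocab obtained once per function. A short sketch of the replacement mapping; the helper is illustrative, the call names are all taken from the diff above.

    // Illustrative sketch of the model-to-vocab query migration used above.
    #include <cstdio>
    #include "llama.h"

    static void dump_vocab_info(llama_context * ctx) {
        const llama_model * model = llama_get_model(ctx);
        const llama_vocab * vocab = llama_model_get_vocab(model);

        const int         n_vocab = llama_vocab_n_tokens(vocab);    // was llama_n_vocab(model)
        const bool        add_bos = llama_vocab_get_add_bos(vocab); // was llama_add_bos_token(model)
        const llama_token bos     = llama_vocab_bos(vocab);         // was llama_token_bos(model)

        printf("n_vocab = %d, add_bos = %d, bos = %d\n", n_vocab, add_bos, bos);
    }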

package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp

@@ -1,7 +1,7 @@
- #include "common.h"
  #include "ggml.h"
  #include "llama.h"
- #include "llama-impl.h"
+ #include "llama-context.h"
+ #include "common.h"
 
  #include <algorithm>
  #include <cassert>
@@ -9,11 +9,9 @@
  #include <cmath>
  #include <cstdio>
  #include <cstring>
- #include <map>
  #include <numeric>
  #include <regex>
  #include <string>
- #include <unordered_map>
  #include <vector>
  #include <thread>
  #include <mutex>
@@ -311,7 +309,7 @@ int main(int argc, char ** argv) {
  auto mparams = llama_model_default_params();
  mparams.use_mlock = false;
 
- model = llama_load_model_from_file(params.model.c_str(), mparams);
+ model = llama_model_load_from_file(params.model.c_str(), mparams);
 
  if (model == NULL) {
  fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
@@ -321,22 +319,22 @@ int main(int argc, char ** argv) {
  auto cparams = llama_context_default_params();
  cparams.n_ctx = 256;
 
- ctx = llama_new_context_with_model(model, cparams);
+ ctx = llama_init_from_model(model, cparams);
 
  if (ctx == NULL) {
  fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
- llama_free_model(model);
+ llama_model_free(model);
  return 1;
  }
  }
 
- const auto &tensors = llama_internal_get_tensor_map(ctx);
+ const auto & tensors = llama_internal_get_tensor_map(ctx);
 
  // check layer tensors
  int included_layers = 0;
  int64_t max_nelements = 0;
  bool is_f16 = false;
- for (const auto& kv_tensor : tensors) {
+ for (const auto & kv_tensor : tensors) {
  if (!layer_included(params, kv_tensor.first)) {
  continue;
  }
@@ -349,7 +347,7 @@ int main(int argc, char ** argv) {
  fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
  "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
  llama_free(ctx);
- llama_free_model(model);
+ llama_model_free(model);
  return 1;
  }
  included_layers++;
@@ -371,8 +369,8 @@ int main(int argc, char ** argv) {
  if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
  continue;
  }
- const auto * qfns = ggml_get_type_traits(type);
- const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
+ const auto * qfns = ggml_get_type_traits(type);
+ const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
  if (qfns_cpu->from_float && qfns->to_float) {
  if (params.verbose) {
  printf("testing %s ...\n", ggml_type_name(type));
@@ -382,7 +380,7 @@ int main(int argc, char ** argv) {
 
  error_stats global_stats {};
 
- for (const auto& kv_tensor : tensors) {
+ for (const auto & kv_tensor : tensors) {
  if (!layer_included(params, kv_tensor.first)) {
  continue;
  }
@@ -411,7 +409,7 @@ int main(int argc, char ** argv) {
 
 
  llama_free(ctx);
- llama_free_model(model);
+ llama_model_free(model);
  // report timing
  {
  const int64_t t_main_end_us = ggml_time_us();
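
Note: quantize-stats.cpp also reflects the split between generic and CPU-specific ggml type traits: quantization (from_float) is looked up via ggml_get_type_traits_cpu, while dequantization (to_float) stays on ggml_get_type_traits. A sketch of that check; the header locations are an assumption on my part, the call names and fields come from the diff above.

    // Illustrative sketch; header locations for the CPU traits are an assumption.
    #include "ggml.h"
    #include "ggml-cpu.h"

    static bool type_supports_roundtrip(ggml_type type) {
        const auto * qfns     = ggml_get_type_traits(type);     // generic traits: to_float
        const auto * qfns_cpu = ggml_get_type_traits_cpu(type); // CPU traits: from_float
        return qfns_cpu->from_float && qfns->to_float;
    }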

package/src/llama.cpp/examples/retrieval/retrieval.cpp

@@ -151,15 +151,17 @@ int main(int argc, char ** argv) {
  // load the model
  common_init_result llama_init = common_init_from_params(params);
 
- llama_model * model = llama_init.model;
- llama_context * ctx = llama_init.context;
+ llama_model * model = llama_init.model.get();
+ llama_context * ctx = llama_init.context.get();
 
  if (model == NULL) {
  LOG_ERR("%s: unable to load model\n", __func__);
  return 1;
  }
 
- const int n_ctx_train = llama_n_ctx_train(model);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
+ const int n_ctx_train = llama_model_n_ctx_train(model);
  const int n_ctx = llama_n_ctx(ctx);
 
  const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
@@ -192,8 +194,8 @@ int main(int argc, char ** argv) {
  return 1;
  }
  // add eos if not present
- if (llama_token_eos(model) >= 0 && (inp.empty() || inp.back() != llama_token_eos(model))) {
- inp.push_back(llama_token_eos(model));
+ if (llama_vocab_eos(vocab) >= 0 && (inp.empty() || inp.back() != llama_vocab_eos(vocab))) {
+ inp.push_back(llama_vocab_eos(vocab));
  }
  chunk.tokens = inp;
  }
@@ -215,7 +217,7 @@ int main(int argc, char ** argv) {
  struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
 
  // allocate output
- const int n_embd = llama_n_embd(model);
+ const int n_embd = llama_model_n_embd(model);
  std::vector<float> embeddings(n_chunks * n_embd, 0);
  float * emb = embeddings.data();
 
@@ -298,7 +300,5 @@ int main(int argc, char ** argv) {
 
  // clean up
  llama_batch_free(query_batch);
- llama_free(ctx);
- llama_free_model(model);
  llama_backend_free();
  }
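
Note: the retrieval.cpp changes follow the same renames, applied to the embedding path: dimensions come from llama_model_n_embd and the EOS token from the vocab. A small illustrative sketch of the buffer allocation shown above; the helper name is made up, the API calls are from the diff.

    // Illustrative sketch of the renamed queries used in retrieval.cpp above.
    #include <vector>
    #include "llama.h"

    static std::vector<float> alloc_chunk_embeddings(const llama_model * model, int n_chunks) {
        const llama_vocab * vocab = llama_model_get_vocab(model);

        const int n_embd = llama_model_n_embd(model); // was llama_n_embd(model)
        if (llama_vocab_eos(vocab) >= 0) {            // was llama_token_eos(model)
            // chunks can be terminated with EOS before embedding, as in the diff
        }
        return std::vector<float>(size_t(n_chunks) * n_embd, 0.0f);
    }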

package/src/llama.cpp/examples/rpc/rpc-server.cpp

@@ -12,6 +12,10 @@
  #include "ggml-vulkan.h"
  #endif
 
+ #ifdef GGML_USE_SYCL
+ #include "ggml-sycl.h"
+ #endif
+
  #include "ggml-rpc.h"
  #ifdef _WIN32
  # include <windows.h>
@@ -91,6 +95,12 @@ static ggml_backend_t create_backend() {
  if (!backend) {
  fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
  }
+ #elif GGML_USE_SYCL
+ fprintf(stderr, "%s: using SYCL backend\n", __func__);
+ backend = ggml_backend_sycl_init(0); // init device 0
+ if (!backend) {
+ fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
+ }
  #endif
 
  // if there aren't GPU Backends fallback to CPU backend
@@ -106,6 +116,8 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
  ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
  #elif GGML_USE_VULKAN
  ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
+ #elif GGML_USE_SYCL
+ ggml_backend_sycl_get_device_memory(0, free_mem, total_mem);
  #else
  #ifdef _WIN32
  MEMORYSTATUSEX status;
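
Note: rpc-server.cpp gains a SYCL branch in both its backend factory and its device-memory query, mirroring the existing CUDA/Vulkan branches. Below is a condensed, illustrative sketch of the selection logic with only the SYCL path spelled out; the CPU fallback call and the header locations are assumptions based on the surrounding rpc-server code, not shown in this diff.

    // Condensed, illustrative sketch of backend selection with the new SYCL branch.
    #include <cstdio>
    #include "ggml-backend.h"
    #include "ggml-cpu.h"
    #ifdef GGML_USE_SYCL
    #include "ggml-sycl.h"
    #endif

    static ggml_backend_t pick_backend(void) {
        ggml_backend_t backend = NULL;
    #ifdef GGML_USE_SYCL
        fprintf(stderr, "using SYCL backend\n");
        backend = ggml_backend_sycl_init(0); // device 0, as in the diff
        if (!backend) {
            fprintf(stderr, "ggml_backend_sycl_init() failed\n");
        }
    #endif
        if (!backend) {
            backend = ggml_backend_cpu_init(); // fall back to CPU, as rpc-server does
        }
        return backend;
    }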

package/src/llama.cpp/examples/run/CMakeLists.txt

@@ -1,5 +1,5 @@
  set(TARGET llama-run)
- add_executable(${TARGET} run.cpp)
+ add_executable(${TARGET} run.cpp linenoise.cpp/linenoise.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
  target_compile_features(${TARGET} PRIVATE cxx_std_17)