@fugood/llama.node 0.3.6 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/README.md +17 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +3 -1
  19. package/lib/index.js +16 -1
  20. package/lib/index.ts +16 -0
  21. package/package.json +1 -1
  22. package/src/EmbeddingWorker.cpp +4 -3
  23. package/src/LlamaCompletionWorker.cpp +4 -2
  24. package/src/LlamaContext.cpp +61 -6
  25. package/src/LlamaContext.h +1 -0
  26. package/src/common.hpp +6 -11
  27. package/src/llama.cpp/.github/workflows/build.yml +19 -17
  28. package/src/llama.cpp/.github/workflows/docker.yml +77 -30
  29. package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +22 -3
  31. package/src/llama.cpp/CMakeLists.txt +49 -24
  32. package/src/llama.cpp/common/arg.cpp +82 -26
  33. package/src/llama.cpp/common/arg.h +3 -0
  34. package/src/llama.cpp/common/common.cpp +192 -72
  35. package/src/llama.cpp/common/common.h +51 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +12 -12
  37. package/src/llama.cpp/common/ngram-cache.h +2 -2
  38. package/src/llama.cpp/common/sampling.cpp +11 -6
  39. package/src/llama.cpp/common/speculative.cpp +18 -15
  40. package/src/llama.cpp/docs/build.md +2 -0
  41. package/src/llama.cpp/examples/batched/batched.cpp +9 -7
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
  43. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
  44. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
  45. package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
  46. package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
  47. package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
  48. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
  50. package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
  51. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
  52. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
  53. package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
  54. package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
  55. package/src/llama.cpp/examples/infill/infill.cpp +23 -24
  56. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
  57. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
  58. package/src/llama.cpp/examples/llava/clip.cpp +4 -2
  59. package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
  60. package/src/llama.cpp/examples/llava/llava.cpp +2 -2
  61. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
  62. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
  63. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
  64. package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
  65. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
  66. package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
  67. package/src/llama.cpp/examples/main/main.cpp +51 -29
  68. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
  69. package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
  70. package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
  71. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
  72. package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
  73. package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
  74. package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
  76. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
  77. package/src/llama.cpp/examples/run/run.cpp +175 -61
  78. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
  79. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
  80. package/src/llama.cpp/examples/server/httplib.h +1295 -409
  81. package/src/llama.cpp/examples/server/server.cpp +387 -181
  82. package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
  83. package/src/llama.cpp/examples/server/utils.hpp +170 -58
  84. package/src/llama.cpp/examples/simple/simple.cpp +9 -8
  85. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
  86. package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
  87. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
  88. package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
  89. package/src/llama.cpp/examples/tts/tts.cpp +64 -23
  90. package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
  91. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  92. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
  93. package/src/llama.cpp/ggml/include/ggml.h +36 -145
  94. package/src/llama.cpp/ggml/include/gguf.h +202 -0
  95. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  96. package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
  97. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
  98. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
  99. package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
  100. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
  101. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
  102. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
  103. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
  105. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
  106. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
  107. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  109. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
  111. package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
  112. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
  113. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
  115. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
  117. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
  120. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
  121. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
  124. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
  125. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
  126. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
  128. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
  129. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
  130. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
  131. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
  132. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
  133. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
  134. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
  135. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
  138. package/src/llama.cpp/ggml/src/ggml.c +117 -1327
  139. package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
  140. package/src/llama.cpp/include/llama-cpp.h +6 -1
  141. package/src/llama.cpp/include/llama.h +138 -75
  142. package/src/llama.cpp/src/CMakeLists.txt +13 -1
  143. package/src/llama.cpp/src/llama-adapter.cpp +347 -0
  144. package/src/llama.cpp/src/llama-adapter.h +74 -0
  145. package/src/llama.cpp/src/llama-arch.cpp +1487 -0
  146. package/src/llama.cpp/src/llama-arch.h +400 -0
  147. package/src/llama.cpp/src/llama-batch.cpp +368 -0
  148. package/src/llama.cpp/src/llama-batch.h +88 -0
  149. package/src/llama.cpp/src/llama-chat.cpp +578 -0
  150. package/src/llama.cpp/src/llama-chat.h +52 -0
  151. package/src/llama.cpp/src/llama-context.cpp +1775 -0
  152. package/src/llama.cpp/src/llama-context.h +128 -0
  153. package/src/llama.cpp/src/llama-cparams.cpp +1 -0
  154. package/src/llama.cpp/src/llama-cparams.h +37 -0
  155. package/src/llama.cpp/src/llama-grammar.cpp +5 -4
  156. package/src/llama.cpp/src/llama-grammar.h +3 -1
  157. package/src/llama.cpp/src/llama-hparams.cpp +71 -0
  158. package/src/llama.cpp/src/llama-hparams.h +139 -0
  159. package/src/llama.cpp/src/llama-impl.cpp +167 -0
  160. package/src/llama.cpp/src/llama-impl.h +16 -136
  161. package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
  162. package/src/llama.cpp/src/llama-kv-cache.h +218 -0
  163. package/src/llama.cpp/src/llama-mmap.cpp +589 -0
  164. package/src/llama.cpp/src/llama-mmap.h +67 -0
  165. package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
  166. package/src/llama.cpp/src/llama-model-loader.h +167 -0
  167. package/src/llama.cpp/src/llama-model.cpp +3953 -0
  168. package/src/llama.cpp/src/llama-model.h +370 -0
  169. package/src/llama.cpp/src/llama-quant.cpp +934 -0
  170. package/src/llama.cpp/src/llama-quant.h +1 -0
  171. package/src/llama.cpp/src/llama-sampling.cpp +147 -32
  172. package/src/llama.cpp/src/llama-sampling.h +3 -19
  173. package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
  174. package/src/llama.cpp/src/llama-vocab.h +97 -142
  175. package/src/llama.cpp/src/llama.cpp +7160 -20314
  176. package/src/llama.cpp/src/unicode.cpp +8 -3
  177. package/src/llama.cpp/tests/CMakeLists.txt +2 -0
  178. package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
  179. package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
  180. package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
  181. package/src/llama.cpp/tests/test-gguf.cpp +222 -187
  182. package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
  183. package/src/llama.cpp/tests/test-sampling.cpp +0 -1
  184. package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
  185. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
  186. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp:

@@ -34,7 +34,7 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);
 
     llama_model * model_tgt = NULL;
-    llama_model * model_dft = NULL;
+    //llama_model * model_dft = NULL;
 
     llama_context * ctx_tgt = NULL;
     llama_context * ctx_dft = NULL;
@@ -42,8 +42,10 @@ int main(int argc, char ** argv) {
     // load the target model
     common_init_result llama_init_tgt = common_init_from_params(params);
 
-    model_tgt = llama_init_tgt.model;
-    ctx_tgt = llama_init_tgt.context;
+    model_tgt = llama_init_tgt.model.get();
+    ctx_tgt = llama_init_tgt.context.get();
+
+    const llama_vocab * vocab = llama_model_get_vocab(model_tgt);
 
     // load the draft model
     params.devices = params.speculative.devices;
@@ -59,8 +61,8 @@ int main(int argc, char ** argv) {
     params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
     common_init_result llama_init_dft = common_init_from_params(params);
 
-    model_dft = llama_init_dft.model;
-    ctx_dft = llama_init_dft.context;
+    //model_dft = llama_init_dft.model.get();
+    ctx_dft = llama_init_dft.context.get();
 
     if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
         return 1;
@@ -196,7 +198,7 @@ int main(int argc, char ** argv) {
 
             id_last = ids[i];
 
-            if (llama_token_is_eog(model_tgt, id_last)) {
+            if (llama_vocab_is_eog(vocab, id_last)) {
                 has_eos = true;
                 break;
             }
@@ -251,12 +253,6 @@ int main(int argc, char ** argv) {
     common_sampler_free(smpl);
     common_speculative_free(spec);
 
-    llama_free(ctx_tgt);
-    llama_free_model(model_tgt);
-
-    llama_free(ctx_dft);
-    llama_free_model(model_dft);
-
     llama_backend_free();
 
     LOG("\n\n");
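For orientation, the hunks above track an llama.cpp API migration: common_init_result now owns the model and context through smart pointers, and end-of-generation is queried from the vocab rather than the model. A minimal sketch of the resulting pattern (a fragment, not a full program; it assumes a populated common_params `params` and a sampled token `id_last` as in the example above):

    common_init_result init = common_init_from_params(params);

    llama_model   * model = init.model.get();   // borrowed; released by `init` itself
    llama_context * ctx   = init.context.get(); // borrowed; released by `init` itself

    const llama_vocab * vocab = llama_model_get_vocab(model);

    // end-of-generation is now checked against the vocab, not the model
    if (llama_vocab_is_eog(vocab, id_last)) {
        // generation finished
    }

    // no explicit llama_free(ctx) / llama_free_model(model) teardown anymore
    llama_backend_free();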
package/src/llama.cpp/examples/tokenize/tokenize.cpp:

@@ -31,6 +31,7 @@ static void print_usage_information(const char * argv0) {
     printf(" -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
     printf(" --stdin read prompt from standard input.\n");
     printf(" --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
+    printf(" --no-escape do not escape input (such as \\n, \\t, etc.).\n");
     printf(" --no-parse-special do not parse control tokens.\n");
     printf(" --log-disable disable logs. Makes stderr quiet when loading the model.\n");
     printf(" --show-count print the total number of tokens.\n");
@@ -198,6 +199,7 @@ int main(int raw_argc, char ** raw_argv) {
     // variables where to put any arguments we see.
     bool printing_ids = false;
     bool no_bos = false;
+    bool no_escape = false;
     bool no_parse_special = false;
     bool disable_logging = false;
     bool show_token_count = false;
@@ -233,6 +235,9 @@ int main(int raw_argc, char ** raw_argv) {
        else if (arg == "--no-bos") {
            no_bos = true;
        }
+       else if (arg == "--no-escape") {
+           no_escape = true;
+       }
        else if (arg == "--no-parse-special") {
            no_parse_special = true;
        }
@@ -333,14 +338,16 @@ int main(int raw_argc, char ** raw_argv) {
 
     llama_model_params model_params = llama_model_default_params();
     model_params.vocab_only = true;
-    llama_model * model = llama_load_model_from_file(model_path, model_params);
+    llama_model * model = llama_model_load_from_file(model_path, model_params);
     if (!model) {
        fprintf(stderr, "Error: could not load model from file '%s'.\n", model_path);
        return 1;
     }
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     llama_context_params ctx_params = llama_context_default_params();
-    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+    llama_context * ctx = llama_init_from_model(model, ctx_params);
     if (!ctx) {
        fprintf(stderr, "Error: could not create context.\n");
        return 1;
@@ -360,12 +367,17 @@ int main(int raw_argc, char ** raw_argv) {
        prompt = stdin_buffer.str();
     }
 
-    const bool model_wants_add_bos = llama_add_bos_token(model);
+    const bool model_wants_add_bos = llama_vocab_get_add_bos(vocab);
     const bool add_bos = model_wants_add_bos && !no_bos;
     const bool parse_special = !no_parse_special;
+    const bool escape = !no_escape;
+
+    if (escape) {
+        string_process_escapes(prompt);
+    }
 
     std::vector<llama_token> tokens;
-    tokens = common_tokenize(model, prompt, add_bos, parse_special);
+    tokens = common_tokenize(vocab, prompt, add_bos, parse_special);
 
     if (printing_ids) {
        printf("[");
@@ -398,7 +410,7 @@ int main(int raw_argc, char ** raw_argv) {
     }
     // silence valgrind
     llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);
 
     return 0;
 }
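As a reference for the renamed entry points used above, here is a hedged sketch of the vocab-only load-and-tokenize flow (every function name appears in the hunks; the model path and prompt string are placeholders):

    llama_model_params mparams = llama_model_default_params();
    mparams.vocab_only = true;

    // was: llama_load_model_from_file / llama_new_context_with_model / llama_free_model
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    const llama_vocab * vocab = llama_model_get_vocab(model);

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_init_from_model(model, cparams);

    // tokenization now takes the vocab instead of the model
    std::vector<llama_token> tokens = common_tokenize(vocab, "hello world", /*add_special=*/true, /*parse_special=*/false);

    llama_free(ctx);
    llama_model_free(model);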
package/src/llama.cpp/examples/tts/tts.cpp:

@@ -414,15 +414,42 @@ static void prompt_add(llama_tokens & prompt, const llama_tokens & tokens) {
     prompt.insert(prompt.end(), tokens.begin(), tokens.end());
 }
 
-static void prompt_add(llama_tokens & prompt, const llama_model * model, const std::string & txt, bool add_special, bool parse_special) {
-    auto tmp = common_tokenize(model, txt, add_special, parse_special);
+static void prompt_add(llama_tokens & prompt, const llama_vocab * vocab, const std::string & txt, bool add_special, bool parse_special) {
+    auto tmp = common_tokenize(vocab, txt, add_special, parse_special);
     prompt_add(prompt, tmp);
 }
 
-static void prompt_init(llama_tokens & prompt, const llama_model * model) {
+static void prompt_init(llama_tokens & prompt, const llama_vocab * vocab) {
     prompt.clear();
 
-    prompt_add(prompt, model, "<|im_start|>\n", true, true);
+    prompt_add(prompt, vocab, "<|im_start|>\n", true, true);
+}
+
+static std::vector<llama_token> prepare_guide_tokens(const llama_vocab * vocab, const std::string & str) {
+    const std::string& delimiter = "<|text_sep|>";
+
+    std::vector<llama_token> result;
+    size_t start = 0;
+    size_t end = str.find(delimiter);
+
+    //first token is always a newline, as it was not previously added
+    result.push_back(common_tokenize(vocab, "\n", false, true)[0]);
+
+    while (end != std::string::npos) {
+        std::string current_word = str.substr(start, end - start);
+        auto tmp = common_tokenize(vocab, current_word, false, true);
+        result.push_back(tmp[0]);
+        start = end + delimiter.length();
+        end = str.find(delimiter, start);
+    }
+
+    // Add the last part
+    std::string current_word = str.substr(start);
+    auto tmp = common_tokenize(vocab, current_word, false, true);
+    if (tmp.size() > 0) {
+        result.push_back(tmp[0]);
+    }
+    return result;
 }
 
 int main(int argc, char ** argv) {
@@ -458,8 +485,11 @@ int main(int argc, char ** argv) {
     llama_context * ctx_cts = NULL;
 
     common_init_result llama_init_ttc = common_init_from_params(params);
-    model_ttc = llama_init_ttc.model;
-    ctx_ttc = llama_init_ttc.context;
+
+    model_ttc = llama_init_ttc.model.get();
+    ctx_ttc = llama_init_ttc.context.get();
+
+    const llama_vocab * vocab = llama_model_get_vocab(model_ttc);
 
     // TODO: refactor in a common struct
     params.model = params.vocoder.model;
@@ -470,8 +500,9 @@ int main(int argc, char ** argv) {
     params.embedding = true;
 
     common_init_result llama_init_cts = common_init_from_params(params);
-    model_cts = llama_init_cts.model;
-    ctx_cts = llama_init_cts.context;
+
+    model_cts = llama_init_cts.model.get();
+    ctx_cts = llama_init_cts.context.get();
 
     std::vector<common_sampler *> smpl(n_parallel);
     for (int i = 0; i < n_parallel; ++i) {
@@ -490,6 +521,7 @@ int main(int argc, char ** argv) {
     const auto t_main_start = ggml_time_us();
 
     std::vector<llama_token> codes;
+    std::vector<llama_token> guide_tokens;
 
     // process prompt and generate voice codes
     {
@@ -497,20 +529,23 @@ int main(int argc, char ** argv) {
 
        std::vector<llama_token> prompt_inp;
 
-       prompt_init(prompt_inp, model_ttc);
+       prompt_init(prompt_inp, vocab);
 
-       prompt_add(prompt_inp, model_ttc, "<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>", false, true);
+       prompt_add(prompt_inp, vocab, "<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>", false, true);
 
        // convert the input text into the necessary format expected by OuteTTS
        {
            std::string prompt_clean = process_text(params.prompt);
+           if (params.vocoder.use_guide_tokens) {
+               guide_tokens = prepare_guide_tokens(vocab, prompt_clean);
+           }
 
            LOG_INF("%s: prompt: '%s'\n", __func__, prompt_clean.c_str());
 
-           prompt_add(prompt_inp, model_ttc, prompt_clean, false, true);
+           prompt_add(prompt_inp, vocab, prompt_clean, false, true);
        }
 
-       prompt_add(prompt_inp, model_ttc, "<|text_end|>\n", false, true);
+       prompt_add(prompt_inp, vocab, "<|text_end|>\n", false, true);
 
        // disabled to save time on tokenizing each time
        // TODO: load voices from the json files
@@ -547,7 +582,7 @@ it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><
 looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|>
 lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>)";
 
-       auto tmp = common_tokenize(model_ttc, voice_data, false, true);
+       auto tmp = common_tokenize(vocab, voice_data, false, true);
        printf("\n\n");
        for (int i = 0; i < tmp.size(); ++i) {
            printf("%d, ", tmp[i]);
@@ -713,6 +748,8 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
     int n_past = batch.n_tokens;
     int n_decode = 0;
 
+    bool next_token_uses_guide_token = true;
+
     while (n_decode <= n_predict) {
        // prepare the next batch
        common_batch_clear(batch);
@@ -724,7 +761,17 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
                continue;
            }
 
-           const llama_token new_token_id = common_sampler_sample(smpl[i], ctx_ttc, i_batch[i]);
+           llama_token new_token_id = common_sampler_sample(smpl[i], ctx_ttc, i_batch[i]);
+
+           //guide tokens help prevent hallucinations by forcing the TTS to use the correct word
+           if (!guide_tokens.empty() && next_token_uses_guide_token && !llama_vocab_is_control(vocab, new_token_id) && !llama_vocab_is_eog(vocab, new_token_id)) {
+               llama_token guide_token = guide_tokens[0];
+               guide_tokens.erase(guide_tokens.begin());
+               new_token_id = guide_token; //ensure correct word fragment is used
+           }
+
+           //this is the token id that always precedes a new word
+           next_token_uses_guide_token = (new_token_id == 198);
 
            common_sampler_accept(smpl[i], new_token_id, true);
 
@@ -733,9 +780,9 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
            const auto * cands = common_sampler_get_candidates(smpl[i]);
 
            // is it an end of generation? -> mark the stream as finished
-           if (llama_token_is_eog(model_ttc, new_token_id) || n_decode == n_predict) {
+           if (llama_vocab_is_eog(vocab, new_token_id) || n_decode == n_predict) {
                std::string reason;
-               if (llama_token_is_eog(model_ttc, new_token_id)) {
+               if (llama_vocab_is_eog(vocab, new_token_id)) {
                    reason = "eos";
                } else {
                    reason = "n_predict";
@@ -871,7 +918,7 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
 
 #if 1
     // spectral operations
-    const int n_embd = llama_n_embd(model_cts);
+    const int n_embd = llama_model_n_embd(model_cts);
     const float * embd = llama_get_embeddings(ctx_cts);
 
     auto audio = embd_to_audio(embd, n_codes, n_embd, params.cpuparams.n_threads);
@@ -920,12 +967,6 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
 
     LOG_INF("%s: audio written to file '%s'\n", __func__, fname.c_str());
 
-    llama_free(ctx_ttc);
-    llama_free_model(model_ttc);
-
-    llama_free(ctx_cts);
-    llama_free_model(model_cts);
-
     llama_backend_free();
 
     return 0;
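A short note on the guide-token mechanism introduced above: prepare_guide_tokens splits the cleaned prompt at <|text_sep|> and keeps the first token of each word, and during decoding the sampled token is replaced by the next guide token whenever the previous token was the word-separator id 198, which keeps the synthesized speech aligned with the input words. Illustrative call (the sample text is hypothetical):

    // result: the "\n" token followed by the first token of "the", "overall", "package"
    std::vector<llama_token> guide = prepare_guide_tokens(vocab, "the<|text_sep|>overall<|text_sep|>package<|text_sep|>");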
package/src/llama.cpp/ggml/CMakeLists.txt:

@@ -185,6 +185,9 @@ option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increas
 option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
 option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
 
+# toolchain for vulkan-shaders-gen
+set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
+
 # extra artifacts
 option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
 option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
@@ -243,7 +246,8 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-metal.h
     include/ggml-rpc.h
     include/ggml-sycl.h
-    include/ggml-vulkan.h)
+    include/ggml-vulkan.h
+    include/gguf.h)
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 #if (GGML_METAL)
@@ -252,26 +256,6 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 install(TARGETS ggml LIBRARY PUBLIC_HEADER)
 install(TARGETS ggml-base LIBRARY)
 
-# FIXME: this should be done in the backend cmake files
-if (GGML_METAL)
-    # FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
-    install(
-        FILES src/ggml-metal/ggml-metal.metal
-        PERMISSIONS
-            OWNER_READ
-            OWNER_WRITE
-            GROUP_READ
-            WORLD_READ
-        DESTINATION ${CMAKE_INSTALL_BINDIR})
-
-    if (NOT GGML_METAL_EMBED_LIBRARY)
-        install(
-            FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            DESTINATION ${CMAKE_INSTALL_BINDIR}
-        )
-    endif()
-endif()
-
 if (GGML_STANDALONE)
     configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
                    ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
package/src/llama.cpp/ggml/include/ggml-backend.h:

@@ -203,6 +203,8 @@ extern "C" {
     // Backend registry
     //
 
+    GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
+
     // Backend (reg) enumeration
     GGML_API size_t ggml_backend_reg_count(void);
     GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
package/src/llama.cpp/ggml/include/ggml-cpp.h:

@@ -7,6 +7,7 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
+#include "gguf.h"
 #include <memory>
 
 // Smart pointers for ggml types
package/src/llama.cpp/ggml/include/ggml.h:

@@ -241,12 +241,6 @@
 #define GGML_ROPE_TYPE_MROPE 8
 #define GGML_ROPE_TYPE_VISION 24
 
-#define GGUF_MAGIC "GGUF"
-
-#define GGUF_VERSION 3
-
-#define GGUF_DEFAULT_ALIGNMENT 32
-
 #define GGML_UNUSED(x) (void)(x)
 
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
@@ -403,12 +397,6 @@ extern "C" {
        GGML_PREC_F32,
    };
 
-   enum ggml_backend_type {
-       GGML_BACKEND_TYPE_CPU = 0,
-       GGML_BACKEND_TYPE_GPU = 10,
-       GGML_BACKEND_TYPE_GPU_SPLIT = 20,
-   };
-
    // model file types
    enum ggml_ftype {
        GGML_FTYPE_UNKNOWN = -1,
@@ -513,6 +501,7 @@ extern "C" {
        GGML_OP_GET_REL_POS,
        GGML_OP_ADD_REL_POS,
        GGML_OP_RWKV_WKV6,
+       GGML_OP_GATED_LINEAR_ATTN,
 
        GGML_OP_UNARY,
 
@@ -587,8 +576,6 @@ extern "C" {
    struct ggml_tensor {
        enum ggml_type type;
 
-       GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
-
        struct ggml_backend_buffer * buffer;
 
        int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -1397,16 +1384,20 @@ extern "C" {
            float scale,
            float max_bias);
 
-   GGML_API struct ggml_tensor * ggml_soft_max_back(
+   GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
-           struct ggml_tensor * b);
+           struct ggml_tensor * b,
+           float scale,
+           float max_bias);
 
    // in-place, returns view(a)
-   GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
+   GGML_API struct ggml_tensor * ggml_soft_max_ext_back_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
-           struct ggml_tensor * b);
+           struct ggml_tensor * b,
+           float scale,
+           float max_bias);
 
    // rotary position embedding
    // if (mode & 1) - skip n_past elements (NOT SUPPORTED)
@@ -1513,7 +1504,7 @@ extern "C" {
 
    // rotary position embedding backward, i.e compute dx from dy
    // a - dy
-   GGML_API struct ggml_tensor * ggml_rope_back(
+   GGML_API struct ggml_tensor * ggml_rope_ext_back(
            struct ggml_context * ctx,
            struct ggml_tensor * a, // gradients of ggml_rope result
            struct ggml_tensor * b, // positions
@@ -1528,6 +1519,23 @@ extern "C" {
            float beta_fast,
            float beta_slow);
 
+   GGML_API struct ggml_tensor * ggml_rope_multi_back(
+           struct ggml_context * ctx,
+           struct ggml_tensor * a,
+           struct ggml_tensor * b,
+           struct ggml_tensor * c,
+           int n_dims,
+           int sections[4],
+           int mode,
+           int n_ctx_orig,
+           float freq_base,
+           float freq_scale,
+           float ext_factor,
+           float attn_factor,
+           float beta_fast,
+           float beta_slow);
+
+
    // clamp
    // in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_clamp(
@@ -1873,6 +1881,15 @@ extern "C" {
            struct ggml_tensor * td,
            struct ggml_tensor * state);
 
+   GGML_API struct ggml_tensor * ggml_gated_linear_attn(
+           struct ggml_context * ctx,
+           struct ggml_tensor * k,
+           struct ggml_tensor * v,
+           struct ggml_tensor * q,
+           struct ggml_tensor * g,
+           struct ggml_tensor * state,
+           float scale);
+
    // custom operators
 
    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
@@ -2111,132 +2128,6 @@ extern "C" {
            int64_t n_per_row,
            const float * imatrix);
 
-   //
-   // gguf
-   //
-
-   enum gguf_type {
-       GGUF_TYPE_UINT8 = 0,
-       GGUF_TYPE_INT8 = 1,
-       GGUF_TYPE_UINT16 = 2,
-       GGUF_TYPE_INT16 = 3,
-       GGUF_TYPE_UINT32 = 4,
-       GGUF_TYPE_INT32 = 5,
-       GGUF_TYPE_FLOAT32 = 6,
-       GGUF_TYPE_BOOL = 7,
-       GGUF_TYPE_STRING = 8,
-       GGUF_TYPE_ARRAY = 9,
-       GGUF_TYPE_UINT64 = 10,
-       GGUF_TYPE_INT64 = 11,
-       GGUF_TYPE_FLOAT64 = 12,
-       GGUF_TYPE_COUNT, // marks the end of the enum
-   };
-
-   struct gguf_context;
-
-   struct gguf_init_params {
-       bool no_alloc;
-
-       // if not NULL, create a ggml_context and allocate the tensor data in it
-       struct ggml_context ** ctx;
-   };
-
-   GGML_API struct gguf_context * gguf_init_empty(void);
-   GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
-   //GGML_API struct gguf_context * gguf_init_from_buffer(..);
-
-   GGML_API void gguf_free(struct gguf_context * ctx);
-
-   GGML_API const char * gguf_type_name(enum gguf_type type);
-
-   GGML_API int gguf_get_version (const struct gguf_context * ctx);
-   GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
-   GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
-   GGML_API void * gguf_get_data (const struct gguf_context * ctx);
-
-   GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
-   GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
-   GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
-
-   GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
-   GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
-
-   // will abort if the wrong type is used for the key
-   GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id);
-   GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id);
-   GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
-   GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
-   GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
-   GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
-   GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
-   GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
-   GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
-   GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
-   GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
-   GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
-   GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
-   GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
-   GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
-   GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
-
-   GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
-   GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
-   GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
-   GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
-   GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
-
-   // removes key if it exists
-   GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
-
-   // overrides existing values or adds a new one
-   GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
-   GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
-   GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
-   GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
-   GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
-   GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
-   GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
-   GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
-   GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
-   GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
-   GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
-   GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
-   GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
-   GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
-
-   // set or add KV pairs from another context
-   GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
-
-   // manage tensor info
-   GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
-   GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
-   GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
-
-   // writing gguf files can be done in 2 ways:
-   //
-   // - write the entire gguf_context to a binary file in a single pass:
-   //
-   //   gguf_write_to_file(ctx, fname);
-   //
-   // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
-   //
-   //   FILE * f = fopen(fname, "wb");
-   //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
-   //   fwrite(f, ...);
-   //   void * data = gguf_meta_get_meta_data(ctx);
-   //   fseek(f, 0, SEEK_SET);
-   //   fwrite(f, data, gguf_get_meta_size(ctx));
-   //   free(data);
-   //   fclose(f);
-   //
-
-   // write the entire context to a binary file
-   GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
-
-   // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
-   GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
-   GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
-
 #ifdef __cplusplus
 // restrict not standard in C++
 #  if defined(__GNUC__)
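The gguf_* API removed from ggml.h here is not dropped; it moves to the new standalone public header (ggml/include/gguf.h and ggml/src/gguf.cpp in the file list above). A minimal sketch of using the relocated header, assuming only the declarations shown in this hunk (the exact integer widths of some getters may differ slightly in the new header):

    // gguf-dump.cpp (hypothetical file name): print basic metadata of a GGUF file
    #include "gguf.h"   // previously these declarations came from ggml.h

    #include <cstdio>

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s file.gguf\n", argv[0]);
            return 1;
        }

        struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ NULL };
        struct gguf_context * gctx = gguf_init_from_file(argv[1], params);
        if (!gctx) {
            fprintf(stderr, "failed to read '%s'\n", argv[1]);
            return 1;
        }

        printf("gguf version: %d\n", (int) gguf_get_version(gctx));
        printf("kv pairs    : %d\n", (int) gguf_get_n_kv(gctx));
        printf("tensors     : %d\n", (int) gguf_get_n_tensors(gctx));

        gguf_free(gctx);
        return 0;
    }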