@fugood/llama.node 0.3.7 → 0.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/README.md +17 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +8 -0
  19. package/lib/index.js +16 -1
  20. package/lib/index.ts +16 -0
  21. package/package.json +1 -1
  22. package/src/EmbeddingWorker.cpp +4 -3
  23. package/src/LlamaCompletionWorker.cpp +4 -2
  24. package/src/LlamaContext.cpp +156 -6
  25. package/src/LlamaContext.h +5 -0
  26. package/src/common.hpp +6 -11
  27. package/src/llama.cpp/.github/workflows/build.yml +19 -17
  28. package/src/llama.cpp/.github/workflows/docker.yml +77 -30
  29. package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +22 -3
  31. package/src/llama.cpp/CMakeLists.txt +49 -24
  32. package/src/llama.cpp/common/arg.cpp +82 -26
  33. package/src/llama.cpp/common/arg.h +3 -0
  34. package/src/llama.cpp/common/common.cpp +192 -72
  35. package/src/llama.cpp/common/common.h +51 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +12 -12
  37. package/src/llama.cpp/common/ngram-cache.h +2 -2
  38. package/src/llama.cpp/common/sampling.cpp +11 -6
  39. package/src/llama.cpp/common/speculative.cpp +18 -15
  40. package/src/llama.cpp/docs/build.md +2 -0
  41. package/src/llama.cpp/examples/batched/batched.cpp +9 -7
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
  43. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
  44. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
  45. package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
  46. package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
  47. package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
  48. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
  50. package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
  51. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
  52. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
  53. package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
  54. package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
  55. package/src/llama.cpp/examples/infill/infill.cpp +23 -24
  56. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
  57. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
  58. package/src/llama.cpp/examples/llava/clip.cpp +4 -2
  59. package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
  60. package/src/llama.cpp/examples/llava/llava.cpp +2 -2
  61. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
  62. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
  63. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
  64. package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
  65. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
  66. package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
  67. package/src/llama.cpp/examples/main/main.cpp +51 -29
  68. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
  69. package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
  70. package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
  71. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
  72. package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
  73. package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
  74. package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
  76. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
  77. package/src/llama.cpp/examples/run/run.cpp +175 -61
  78. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
  79. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
  80. package/src/llama.cpp/examples/server/httplib.h +1295 -409
  81. package/src/llama.cpp/examples/server/server.cpp +387 -181
  82. package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
  83. package/src/llama.cpp/examples/server/utils.hpp +170 -58
  84. package/src/llama.cpp/examples/simple/simple.cpp +9 -8
  85. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
  86. package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
  87. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
  88. package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
  89. package/src/llama.cpp/examples/tts/tts.cpp +64 -23
  90. package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
  91. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  92. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
  93. package/src/llama.cpp/ggml/include/ggml.h +36 -145
  94. package/src/llama.cpp/ggml/include/gguf.h +202 -0
  95. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  96. package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
  97. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
  98. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
  99. package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
  100. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
  101. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
  102. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
  103. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
  105. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
  106. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
  107. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  109. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
  111. package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
  112. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
  113. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
  115. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
  117. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
  120. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
  121. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
  124. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
  125. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
  126. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
  128. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
  129. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
  130. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
  131. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
  132. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
  133. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
  134. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
  135. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
  138. package/src/llama.cpp/ggml/src/ggml.c +117 -1327
  139. package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
  140. package/src/llama.cpp/include/llama-cpp.h +6 -1
  141. package/src/llama.cpp/include/llama.h +138 -75
  142. package/src/llama.cpp/src/CMakeLists.txt +13 -1
  143. package/src/llama.cpp/src/llama-adapter.cpp +347 -0
  144. package/src/llama.cpp/src/llama-adapter.h +74 -0
  145. package/src/llama.cpp/src/llama-arch.cpp +1487 -0
  146. package/src/llama.cpp/src/llama-arch.h +400 -0
  147. package/src/llama.cpp/src/llama-batch.cpp +368 -0
  148. package/src/llama.cpp/src/llama-batch.h +88 -0
  149. package/src/llama.cpp/src/llama-chat.cpp +578 -0
  150. package/src/llama.cpp/src/llama-chat.h +52 -0
  151. package/src/llama.cpp/src/llama-context.cpp +1775 -0
  152. package/src/llama.cpp/src/llama-context.h +128 -0
  153. package/src/llama.cpp/src/llama-cparams.cpp +1 -0
  154. package/src/llama.cpp/src/llama-cparams.h +37 -0
  155. package/src/llama.cpp/src/llama-grammar.cpp +5 -4
  156. package/src/llama.cpp/src/llama-grammar.h +3 -1
  157. package/src/llama.cpp/src/llama-hparams.cpp +71 -0
  158. package/src/llama.cpp/src/llama-hparams.h +139 -0
  159. package/src/llama.cpp/src/llama-impl.cpp +167 -0
  160. package/src/llama.cpp/src/llama-impl.h +16 -136
  161. package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
  162. package/src/llama.cpp/src/llama-kv-cache.h +218 -0
  163. package/src/llama.cpp/src/llama-mmap.cpp +589 -0
  164. package/src/llama.cpp/src/llama-mmap.h +67 -0
  165. package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
  166. package/src/llama.cpp/src/llama-model-loader.h +167 -0
  167. package/src/llama.cpp/src/llama-model.cpp +3953 -0
  168. package/src/llama.cpp/src/llama-model.h +370 -0
  169. package/src/llama.cpp/src/llama-quant.cpp +934 -0
  170. package/src/llama.cpp/src/llama-quant.h +1 -0
  171. package/src/llama.cpp/src/llama-sampling.cpp +147 -32
  172. package/src/llama.cpp/src/llama-sampling.h +3 -19
  173. package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
  174. package/src/llama.cpp/src/llama-vocab.h +97 -142
  175. package/src/llama.cpp/src/llama.cpp +7160 -20314
  176. package/src/llama.cpp/src/unicode.cpp +8 -3
  177. package/src/llama.cpp/tests/CMakeLists.txt +2 -0
  178. package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
  179. package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
  180. package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
  181. package/src/llama.cpp/tests/test-gguf.cpp +222 -187
  182. package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
  183. package/src/llama.cpp/tests/test-sampling.cpp +0 -1
  184. package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
  185. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
  186. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
package/src/llama.cpp/common/common.h
@@ -2,7 +2,7 @@
 
  #pragma once
 
- #include "llama.h"
+ #include "llama-cpp.h"
 
  #include <string>
  #include <vector>
@@ -24,13 +24,11 @@
 
  #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
- struct common_lora_adapter_info {
+ struct common_adapter_lora_info {
  std::string path;
  float scale;
- };
 
- struct common_lora_adapter_container : common_lora_adapter_info {
- struct llama_lora_adapter * adapter;
+ struct llama_adapter_lora * ptr;
  };
 
  using llama_tokens = std::vector<llama_token>;
@@ -105,6 +103,12 @@ enum dimre_method {
  DIMRE_METHOD_MEAN,
  };
 
+ enum common_conversation_mode {
+ COMMON_CONVERSATION_MODE_DISABLED = 0,
+ COMMON_CONVERSATION_MODE_ENABLED = 1,
+ COMMON_CONVERSATION_MODE_AUTO = 2,
+ };
+
  // sampling parameters
  struct common_params_sampling {
  uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -180,6 +184,8 @@ struct common_params_vocoder {
 
  std::string model = ""; // model path // NOLINT
  std::string model_url = ""; // model url to download // NOLINT
+
+ bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
  };
 
  struct common_params {
@@ -242,14 +248,13 @@ struct common_params {
  std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
  std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
  std::string logits_file = ""; // file for saving *all* logits // NOLINT
- std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
 
  std::vector<std::string> in_files; // all input files
  std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
  std::vector<llama_model_kv_override> kv_overrides;
 
- bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
- std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+ bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
+ std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
 
  std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
 
@@ -277,7 +282,6 @@ struct common_params {
  bool special = false; // enable special token output
  bool interactive = false; // interactive mode
  bool interactive_first = false; // wait for user input immediately
- bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
  bool prompt_cache_all = false; // save user input and generations to prompt cache
  bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
 
@@ -303,6 +307,8 @@ struct common_params {
  ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
  ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
 
+ common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
+
  // multimodal models (see examples/llava)
  std::string mmproj = ""; // path to multimodal projector // NOLINT
  std::vector<std::string> image; // path to image file(s)
@@ -456,6 +462,11 @@ static bool string_starts_with(const std::string & str,
  return str.rfind(prefix, 0) == 0;
  }
 
+ static bool string_ends_with(const std::string & str,
+ const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
+ return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+ }
+
  bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
  void string_process_escapes(std::string & input);
 
@@ -478,10 +489,12 @@ std::string fs_get_cache_file(const std::string & filename);
  // Model utils
  //
 
+ // note: defines object's lifetime
  struct common_init_result {
- struct llama_model * model = nullptr;
- struct llama_context * context = nullptr;
- std::vector<common_lora_adapter_container> lora_adapters;
+ llama_model_ptr model;
+ llama_context_ptr context;
+
+ std::vector<llama_adapter_lora_ptr> lora;
  };
 
  struct common_init_result common_init_from_params(common_params & params);
@@ -501,9 +514,12 @@ struct llama_model * common_load_model_from_hf(
  const std::string & local_path,
  const std::string & hf_token,
  const struct llama_model_params & params);
+ std::pair<std::string, std::string> common_get_hf_file(
+ const std::string & hf_repo_with_tag,
+ const std::string & hf_token);
 
  // clear LoRA adapters from context, then apply new list of adapters
- void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
+ void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
 
  //
  // Batch utils
@@ -541,7 +557,7 @@ std::vector<llama_token> common_tokenize(
  bool parse_special = false);
 
  std::vector<llama_token> common_tokenize(
- const struct llama_model * model,
+ const struct llama_vocab * vocab,
  const std::string & text,
  bool add_special,
  bool parse_special = false);
@@ -553,11 +569,21 @@ std::string common_token_to_piece(
  llama_token token,
  bool special = true);
 
+ std::string common_token_to_piece(
+ const struct llama_vocab * vocab,
+ llama_token token,
+ bool special = true);
+
  // detokenizes a vector of tokens into a string
  // should work similar to Python's `tokenizer.decode`
  // optionally renders special/control tokens
  std::string common_detokenize(
- llama_context * ctx,
+ const struct llama_context * ctx,
+ const std::vector<llama_token> & tokens,
+ bool special = true);
+
+ std::string common_detokenize(
+ const struct llama_vocab * vocab,
  const std::vector<llama_token> & tokens,
  bool special = true);
 
@@ -571,6 +597,9 @@ struct common_chat_msg {
  std::string content;
  };
 
+ // Get the built-in chat template for the model. Return empty string if not present.
+ std::string common_get_builtin_chat_template(const struct llama_model * model);
+
  // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
  bool common_chat_verify_template(const std::string & tmpl);
 
@@ -637,6 +666,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
  // Split utils
  //
 
- static const char * const LLM_KV_SPLIT_NO = "split.no";
- static const char * const LLM_KV_SPLIT_COUNT = "split.count";
- static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+ namespace {
+
+ const char * const LLM_KV_SPLIT_NO = "split.no";
+ const char * const LLM_KV_SPLIT_COUNT = "split.count";
+ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+ }
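Note on the common.h hunks above: `common_init_result` now owns the model and context through `llama_model_ptr`/`llama_context_ptr`, and the LoRA helpers are renamed. The following is a minimal sketch (not code from this package) of how a caller of these helpers might adapt; the `run()` wrapper is hypothetical, and only the `common_*`/`llama_*` names come from the declarations shown above.

```cpp
#include "common.h"

// Hypothetical caller, for illustration only.
int run(common_params & params) {
    // common_init_from_params() now returns owning smart pointers.
    common_init_result init = common_init_from_params(params);

    llama_model   * model = init.model.get();   // was: init.model (raw pointer)
    llama_context * ctx   = init.context.get(); // was: init.context

    if (model == nullptr || ctx == nullptr) {
        return 1;
    }

    // LoRA adapters are described by common_adapter_lora_info and applied with
    // common_set_adapter_lora() (previously common_lora_adapters_apply()).
    common_set_adapter_lora(ctx, params.lora_adapters);

    // ... evaluate ...

    return 0; // model and context are released when `init` goes out of scope
}
```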
package/src/llama.cpp/common/ngram-cache.cpp
@@ -65,13 +65,13 @@ constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
  static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
  common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
  if (part_static_it == nc_static.end()) {
- return -1;
+ return LLAMA_TOKEN_NULL;
  }
  const common_ngram_cache_part part_static = part_static_it->second;
 
  int max_count_static = 0;
  int sum_count_static = 0;
- llama_token max_token = -1;
+ llama_token max_token = LLAMA_TOKEN_NULL;
 
  for (std::pair<llama_token, int> token_count_static : part_static) {
  const llama_token token = token_count_static.first;
@@ -85,10 +85,10 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram
  }
 
  if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
- return -1;
+ return LLAMA_TOKEN_NULL;
  }
  if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
- return -1;
+ return LLAMA_TOKEN_NULL;
  }
  return max_token;
  }
@@ -98,9 +98,9 @@ static llama_token try_draft(
  common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
  const int * min_sample_size, const int * min_percent) {
 
- llama_token drafted_token = -1;
+ llama_token drafted_token = LLAMA_TOKEN_NULL;
 
- for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
+ for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == LLAMA_TOKEN_NULL; --i) {
  const common_ngram ngram_primary = ngrams_primary[i];
 
  common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
@@ -112,7 +112,7 @@ static llama_token try_draft(
  int max_count_primary = 0;
  int max_count_static = 0;
  int sum_count_primary = 0;
- llama_token max_token = -1;
+ llama_token max_token = LLAMA_TOKEN_NULL;
 
  for (std::pair<llama_token, int> token_count_primary : part_primary) {
  const llama_token token = token_count_primary.first;
@@ -154,7 +154,7 @@ void common_ngram_cache_draft(
  }
 
  while ((int) draft.size()-1 < n_draft) {
- llama_token drafted_token = -1;
+ llama_token drafted_token = LLAMA_TOKEN_NULL;
 
  const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
  common_ngram ngram_static;
@@ -177,17 +177,17 @@
  }
  ngrams_cd.push_back(ngram_cd);
  }
- if (drafted_token == -1) {
+ if (drafted_token == LLAMA_TOKEN_NULL) {
  drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
  }
- if (drafted_token == -1) {
+ if (drafted_token == LLAMA_TOKEN_NULL) {
  drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
  }
- if (drafted_token == -1) {
+ if (drafted_token == LLAMA_TOKEN_NULL) {
  drafted_token = try_draft(nc_static, ngram_static);
  }
 
- if (drafted_token == -1) {
+ if (drafted_token == LLAMA_TOKEN_NULL) {
  break;
  }
 

package/src/llama.cpp/common/ngram-cache.h
@@ -17,13 +17,13 @@ struct common_ngram {
 
  common_ngram() {
  for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
- tokens[i] = -1;
+ tokens[i] = LLAMA_TOKEN_NULL;
  }
  }
 
  common_ngram(const llama_token * input, const int ngram_size) {
  for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
- tokens[i] = i < ngram_size ? input[i] : -1;
+ tokens[i] = i < ngram_size ? input[i] : LLAMA_TOKEN_NULL;
  }
  }
 

package/src/llama.cpp/common/sampling.cpp
@@ -113,7 +113,10 @@ struct common_sampler {
  void set_logits(struct llama_context * ctx, int idx) {
  const auto * logits = llama_get_logits_ith(ctx, idx);
 
- const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
+ const int n_vocab = llama_vocab_n_tokens(vocab);
 
  cur.resize(n_vocab);
 
@@ -142,13 +145,15 @@ std::string common_params_sampling::print() const {
  }
 
  struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
  llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
  lparams.no_perf = params.no_perf;
 
  auto * result = new common_sampler {
  /* .params = */ params,
- /* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
+ /* .grmr = */ llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"),
  /* .chain = */ llama_sampler_chain_init(lparams),
  /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
  /* .cur = */ {},
@@ -157,7 +162,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
  llama_sampler_chain_add(result->chain,
  llama_sampler_init_logit_bias(
- llama_n_vocab(model),
+ llama_vocab_n_tokens(vocab),
  params.logit_bias.size(),
  params.logit_bias.data()));
 
@@ -172,7 +177,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
  c_breakers.push_back(str.c_str());
  }
 
- llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+ llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
  }
  break;
  case COMMON_SAMPLER_TYPE_TOP_K:
@@ -194,7 +199,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
  llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
  break;
  case COMMON_SAMPLER_TYPE_INFILL:
- llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
+ llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
  break;
  case COMMON_SAMPLER_TYPE_PENALTIES:
  llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
@@ -206,7 +211,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
  llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
  } else if (params.mirostat == 1) {
  llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
- llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+ llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
  } else if (params.mirostat == 2) {
  llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
  llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
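The sampling.cpp hunks above all follow the same migration: vocabulary queries now go through a `llama_vocab` handle obtained from the model instead of taking the model directly. A minimal sketch of that pattern (illustrative only; the helper name `vocab_size_of` is not from the diff):

```cpp
#include "llama.h"

// Illustrative helper: resolve the vocab handle once, then query it.
static int32_t vocab_size_of(llama_context * ctx) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);

    // was: llama_n_vocab(model)
    return llama_vocab_n_tokens(vocab);
}
```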
package/src/llama.cpp/common/speculative.cpp
@@ -79,10 +79,13 @@ bool common_speculative_are_compatible(
  const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
  const struct llama_model * model_dft = llama_get_model(ctx_dft);
 
- const bool vocab_type_tgt = llama_vocab_type(model_tgt);
+ const struct llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
+ const struct llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);
+
+ const bool vocab_type_tgt = llama_vocab_type(vocab_tgt);
  LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
 
- const bool vocab_type_dft = llama_vocab_type(model_dft);
+ const bool vocab_type_dft = llama_vocab_type(vocab_dft);
  LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
 
  if (vocab_type_tgt != vocab_type_dft) {
@@ -91,34 +94,34 @@ bool common_speculative_are_compatible(
  return false;
  }
 
- if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
- llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
- llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
- llama_token_eos(model_tgt) != llama_token_eos(model_dft)) {
- LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
- LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt));
- LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft));
+ if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
+ llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
+ llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
+ llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) {
+ LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__);
+ LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_tgt));
+ LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft));
  return false;
  }
 
  {
- const int n_vocab_tgt = llama_n_vocab(model_tgt);
- const int n_vocab_dft = llama_n_vocab(model_dft);
+ const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
+ const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
 
  const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
 
  if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
  LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
  "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
- __func__, n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
+ __func__, n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
  return false;
  }
 
  for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
- const char * token_text_tgt = llama_token_get_text(model_tgt, i);
- const char * token_text_dft = llama_token_get_text(model_dft, i);
+ const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
+ const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
  if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
- LOG_ERR("%s: draft model vocab must match target model to use speculation but "
+ LOG_ERR("%s: draft vocab vocab must match target vocab to use speculation but "
  "token %d content differs - target '%s', draft '%s'\n", __func__, i,
  common_token_to_piece(ctx_tgt, i).c_str(),
  common_token_to_piece(ctx_dft, i).c_str());

package/src/llama.cpp/docs/build.md
@@ -127,6 +127,8 @@ For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
 
  This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from the [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).
 
+ If you are using Fedora (using Fedora Workstation, or an 'Atomic' variant such as Silverblue), or would like to set up CUDA in a toolbox, please consider our [Fedora CUDA guide](./cuda-fedora.md). Unfortunately, the process is not as simple as one might expect.
+
  - Using `CMake`:
 
  ```bash

package/src/llama.cpp/examples/batched/batched.cpp
@@ -41,17 +41,19 @@ int main(int argc, char ** argv) {
 
  llama_model_params model_params = common_model_params_to_llama(params);
 
- llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+ llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
 
  if (model == NULL) {
  LOG_ERR("%s: error: unable to load model\n" , __func__);
  return 1;
  }
 
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
  // tokenize the prompt
 
  std::vector<llama_token> tokens_list;
- tokens_list = common_tokenize(model, params.prompt, true);
+ tokens_list = common_tokenize(vocab, params.prompt, true);
 
  const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
 
@@ -62,7 +64,7 @@ int main(int argc, char ** argv) {
  ctx_params.n_ctx = n_kv_req;
  ctx_params.n_batch = std::max(n_predict, n_parallel);
 
- llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+ llama_context * ctx = llama_init_from_model(model, ctx_params);
 
  auto sparams = llama_sampler_chain_default_params();
  sparams.no_perf = false;
@@ -120,8 +122,8 @@ int main(int argc, char ** argv) {
  }
 
  llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
- if (decoder_start_token_id == -1) {
- decoder_start_token_id = llama_token_bos(model);
+ if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
+ decoder_start_token_id = llama_vocab_bos(vocab);
  }
 
  common_batch_clear(batch);
@@ -174,7 +176,7 @@ int main(int argc, char ** argv) {
  const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
 
  // is it an end of generation? -> mark the stream as finished
- if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
+ if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_predict) {
  i_batch[i] = -1;
  LOG("\n");
  if (n_parallel > 1) {
@@ -236,7 +238,7 @@
 
  llama_sampler_free(smpl);
  llama_free(ctx);
- llama_free_model(model);
+ llama_model_free(model);
 
  llama_backend_free();
 
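The batched example above also picks up the renamed model/context lifecycle entry points (`llama_model_load_from_file`, `llama_init_from_model`, `llama_model_free`). A stripped-down sketch of that lifecycle, for illustration only (the `load_and_release` wrapper is hypothetical; generation and sampling are omitted):

```cpp
#include "llama.h"

int load_and_release(const char * model_path) {
    llama_model_params mparams = llama_model_default_params();
    // was: llama_load_model_from_file()
    llama_model * model = llama_model_load_from_file(model_path, mparams);
    if (model == NULL) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    // was: llama_new_context_with_model()
    llama_context * ctx = llama_init_from_model(model, cparams);
    if (ctx == NULL) {
        llama_model_free(model);
        return 1;
    }

    llama_free(ctx);
    llama_model_free(model); // was: llama_free_model()
    return 0;
}
```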
package/src/llama.cpp/examples/batched-bench/batched-bench.cpp
@@ -38,7 +38,7 @@ int main(int argc, char ** argv) {
 
  llama_model_params model_params = common_model_params_to_llama(params);
 
- llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+ llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
 
  if (model == NULL) {
  fprintf(stderr , "%s: error: unable to load model\n" , __func__);
@@ -50,7 +50,7 @@ int main(int argc, char ** argv) {
  // ensure enough sequences are available
  ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
 
- llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+ llama_context * ctx = llama_init_from_model(model, ctx_params);
 
  if (ctx == NULL) {
  fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
@@ -194,7 +194,7 @@ int main(int argc, char ** argv) {
  llama_batch_free(batch);
 
  llama_free(ctx);
- llama_free_model(model);
+ llama_model_free(model);
 
  llama_backend_free();
 

package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -1,4 +1,6 @@
  #include "ggml.h"
+ #include "gguf.h"
+
  #include "llama.h"
  #include "common.h"
  #include "log.h"
@@ -434,12 +436,12 @@ static void print_matrix(struct ggml_tensor * probs) {
  }
  }
 
- struct llama_file {
+ struct my_llama_file {
  // use FILE * so we don't have to re-open the file to mmap
  FILE * fp;
  size_t size;
 
- llama_file(const char * fname, const char * mode) {
+ my_llama_file(const char * fname, const char * mode) {
  fp = std::fopen(fname, mode);
  if (fp == NULL) {
  size = 0;
@@ -500,7 +502,7 @@ struct llama_file {
  return std::string(chars.data(), len);
  }
 
- ~llama_file() {
+ ~my_llama_file() {
  if (fp) {
  std::fclose(fp);
  }
@@ -508,7 +510,7 @@
  };
 
  static bool is_ggml_file(const char * filename) {
- llama_file file(filename, "rb");
+ my_llama_file file(filename, "rb");
  if (file.size < 4) {
  return false;
  }
@@ -576,7 +578,7 @@ static void load_vocab(const char * filename, const Config * config, struct my_l
  } else {
  // assume llama2.c vocabulary
  LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
- llama_file file(filename, "rb");
+ my_llama_file file(filename, "rb");
  if (!file.fp) {
  die_fmt("%s: %s", strerror(errno), filename);
  }
@@ -689,8 +691,8 @@ static void save_as_llama_model(
  gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
  gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
  gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
- gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
- gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);
+ gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, LLAMA_TOKEN_NULL);
+ gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, LLAMA_TOKEN_NULL);
 
  gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
  gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
@@ -909,7 +911,7 @@ int main(int argc, char ** argv) {
  load_vocab(params.fn_vocab_model, &config, &vocab);
 
  struct my_llama_model model;
- model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
+ model.hparams.n_vocab = config.vocab_size; //llama_vocab_n_vocab(lctx);
  model.hparams.n_ctx = params.n_ctx;
  model.hparams.n_embd = config.dim; //params.n_embd;
  model.hparams.n_ff = config.hidden_dim;

package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp
@@ -1,7 +1,9 @@
+ #include "ggml.h"
+ #include "gguf.h"
+
  #include "arg.h"
  #include "common.h"
  #include "llama.h"
- #include "ggml.h"
  #include "pca.hpp"
  #include "mean.hpp"
 
@@ -271,7 +273,9 @@ struct tokenized_prompt {
  size_t max_seq_len;
 
  tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
- const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+ const bool add_bos = llama_vocab_get_add_bos(vocab);
  tokens_pos = common_tokenize(ctx, pos, add_bos, true);
  tokens_neg = common_tokenize(ctx, neg, add_bos, true);
  max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
@@ -415,12 +419,13 @@ int main(int argc, char ** argv) {
  // load the model to get hparams
  common_init_result llama_init = common_init_from_params(params);
 
- llama_model * model = llama_init.model;
- llama_context * ctx = llama_init.context;
+ llama_model * model = llama_init.model.get();
+ llama_context * ctx = llama_init.context.get();
 
  // int n_ctx = llama_n_ctx(ctx);
- int n_layers = llama_n_layer(model);
- int n_embd = llama_n_embd(model);
+ int n_layers = llama_model_n_layer(model);
+ int n_embd = llama_model_n_embd(model);
+
  // get model hint param (a.k.a model arch name)
  char model_hint[128];
  llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
@@ -474,8 +479,6 @@
 
  // done with the model, we can now free it to make gain some memory
  printf("Done evaluate prompts, unload model...\n");
- llama_free(ctx);
- llama_free_model(model);
 
  bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
 

package/src/llama.cpp/examples/cvector-generator/mean.hpp
@@ -15,7 +15,7 @@ static void run(
  for (size_t il = 0; il < v_input.size(); ++il) {
  // prepare output vector
  struct ggml_tensor * ctrl_out = v_output[il];
- ggml_format_name(ctrl_out, "direction.%ld", il+1);
+ ggml_format_name(ctrl_out, "direction.%zu", il+1);
 
  // calculate mean vector
  struct ggml_tensor * t_layer = v_input[il];

package/src/llama.cpp/examples/cvector-generator/pca.hpp
@@ -302,7 +302,7 @@ static void run_pca(
 
  // prepare output vector
  struct ggml_tensor * ctrl_out = v_output[il];
- ggml_format_name(ctrl_out, "direction.%ld", il+1);
+ ggml_format_name(ctrl_out, "direction.%zu", il+1);
 
  // run power_iteration
  params.i_layer = il;