@fugood/llama.node 0.3.6 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/README.md +17 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +3 -1
  19. package/lib/index.js +16 -1
  20. package/lib/index.ts +16 -0
  21. package/package.json +1 -1
  22. package/src/EmbeddingWorker.cpp +4 -3
  23. package/src/LlamaCompletionWorker.cpp +4 -2
  24. package/src/LlamaContext.cpp +61 -6
  25. package/src/LlamaContext.h +1 -0
  26. package/src/common.hpp +6 -11
  27. package/src/llama.cpp/.github/workflows/build.yml +19 -17
  28. package/src/llama.cpp/.github/workflows/docker.yml +77 -30
  29. package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +22 -3
  31. package/src/llama.cpp/CMakeLists.txt +49 -24
  32. package/src/llama.cpp/common/arg.cpp +82 -26
  33. package/src/llama.cpp/common/arg.h +3 -0
  34. package/src/llama.cpp/common/common.cpp +192 -72
  35. package/src/llama.cpp/common/common.h +51 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +12 -12
  37. package/src/llama.cpp/common/ngram-cache.h +2 -2
  38. package/src/llama.cpp/common/sampling.cpp +11 -6
  39. package/src/llama.cpp/common/speculative.cpp +18 -15
  40. package/src/llama.cpp/docs/build.md +2 -0
  41. package/src/llama.cpp/examples/batched/batched.cpp +9 -7
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
  43. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
  44. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
  45. package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
  46. package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
  47. package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
  48. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
  50. package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
  51. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
  52. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
  53. package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
  54. package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
  55. package/src/llama.cpp/examples/infill/infill.cpp +23 -24
  56. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
  57. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
  58. package/src/llama.cpp/examples/llava/clip.cpp +4 -2
  59. package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
  60. package/src/llama.cpp/examples/llava/llava.cpp +2 -2
  61. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
  62. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
  63. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
  64. package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
  65. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
  66. package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
  67. package/src/llama.cpp/examples/main/main.cpp +51 -29
  68. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
  69. package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
  70. package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
  71. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
  72. package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
  73. package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
  74. package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
  76. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
  77. package/src/llama.cpp/examples/run/run.cpp +175 -61
  78. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
  79. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
  80. package/src/llama.cpp/examples/server/httplib.h +1295 -409
  81. package/src/llama.cpp/examples/server/server.cpp +387 -181
  82. package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
  83. package/src/llama.cpp/examples/server/utils.hpp +170 -58
  84. package/src/llama.cpp/examples/simple/simple.cpp +9 -8
  85. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
  86. package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
  87. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
  88. package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
  89. package/src/llama.cpp/examples/tts/tts.cpp +64 -23
  90. package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
  91. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  92. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
  93. package/src/llama.cpp/ggml/include/ggml.h +36 -145
  94. package/src/llama.cpp/ggml/include/gguf.h +202 -0
  95. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  96. package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
  97. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
  98. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
  99. package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
  100. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
  101. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
  102. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
  103. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
  105. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
  106. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
  107. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  109. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
  111. package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
  112. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
  113. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
  115. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
  117. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
  120. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
  121. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
  124. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
  125. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
  126. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
  128. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
  129. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
  130. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
  131. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
  132. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
  133. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
  134. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
  135. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
  138. package/src/llama.cpp/ggml/src/ggml.c +117 -1327
  139. package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
  140. package/src/llama.cpp/include/llama-cpp.h +6 -1
  141. package/src/llama.cpp/include/llama.h +138 -75
  142. package/src/llama.cpp/src/CMakeLists.txt +13 -1
  143. package/src/llama.cpp/src/llama-adapter.cpp +347 -0
  144. package/src/llama.cpp/src/llama-adapter.h +74 -0
  145. package/src/llama.cpp/src/llama-arch.cpp +1487 -0
  146. package/src/llama.cpp/src/llama-arch.h +400 -0
  147. package/src/llama.cpp/src/llama-batch.cpp +368 -0
  148. package/src/llama.cpp/src/llama-batch.h +88 -0
  149. package/src/llama.cpp/src/llama-chat.cpp +578 -0
  150. package/src/llama.cpp/src/llama-chat.h +52 -0
  151. package/src/llama.cpp/src/llama-context.cpp +1775 -0
  152. package/src/llama.cpp/src/llama-context.h +128 -0
  153. package/src/llama.cpp/src/llama-cparams.cpp +1 -0
  154. package/src/llama.cpp/src/llama-cparams.h +37 -0
  155. package/src/llama.cpp/src/llama-grammar.cpp +5 -4
  156. package/src/llama.cpp/src/llama-grammar.h +3 -1
  157. package/src/llama.cpp/src/llama-hparams.cpp +71 -0
  158. package/src/llama.cpp/src/llama-hparams.h +139 -0
  159. package/src/llama.cpp/src/llama-impl.cpp +167 -0
  160. package/src/llama.cpp/src/llama-impl.h +16 -136
  161. package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
  162. package/src/llama.cpp/src/llama-kv-cache.h +218 -0
  163. package/src/llama.cpp/src/llama-mmap.cpp +589 -0
  164. package/src/llama.cpp/src/llama-mmap.h +67 -0
  165. package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
  166. package/src/llama.cpp/src/llama-model-loader.h +167 -0
  167. package/src/llama.cpp/src/llama-model.cpp +3953 -0
  168. package/src/llama.cpp/src/llama-model.h +370 -0
  169. package/src/llama.cpp/src/llama-quant.cpp +934 -0
  170. package/src/llama.cpp/src/llama-quant.h +1 -0
  171. package/src/llama.cpp/src/llama-sampling.cpp +147 -32
  172. package/src/llama.cpp/src/llama-sampling.h +3 -19
  173. package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
  174. package/src/llama.cpp/src/llama-vocab.h +97 -142
  175. package/src/llama.cpp/src/llama.cpp +7160 -20314
  176. package/src/llama.cpp/src/unicode.cpp +8 -3
  177. package/src/llama.cpp/tests/CMakeLists.txt +2 -0
  178. package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
  179. package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
  180. package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
  181. package/src/llama.cpp/tests/test-gguf.cpp +222 -187
  182. package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
  183. package/src/llama.cpp/tests/test-sampling.cpp +0 -1
  184. package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
  185. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
  186. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6

package/src/llama.cpp/examples/embedding/embedding.cpp
@@ -97,14 +97,17 @@ int main(int argc, char ** argv) {
  // load the model
  common_init_result llama_init = common_init_from_params(params);

- llama_model * model = llama_init.model;
- llama_context * ctx = llama_init.context;
+ llama_model * model = llama_init.model.get();
+ llama_context * ctx = llama_init.context.get();
+
  if (model == NULL) {
  LOG_ERR("%s: unable to load model\n", __func__);
  return 1;
  }

- const int n_ctx_train = llama_n_ctx_train(model);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
+ const int n_ctx_train = llama_model_n_ctx_train(model);
  const int n_ctx = llama_n_ctx(ctx);

  const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
@@ -147,7 +150,7 @@ int main(int argc, char ** argv) {
  // check if the last token is SEP
  // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
  for (auto & inp : inputs) {
- if (inp.empty() || inp.back() != llama_token_sep(model)) {
+ if (inp.empty() || inp.back() != llama_vocab_sep(vocab)) {
  LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
  LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
  }
@@ -180,7 +183,7 @@ int main(int argc, char ** argv) {
  }

  // allocate output
- const int n_embd = llama_n_embd(model);
+ const int n_embd = llama_model_n_embd(model);
  std::vector<float> embeddings(n_embd_count * n_embd, 0);
  float * emb = embeddings.data();

@@ -316,8 +319,6 @@ int main(int argc, char ** argv) {

  // clean up
  llama_batch_free(batch);
- llama_free(ctx);
- llama_free_model(model);
  llama_backend_free();

  return 0;
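
Note: the embedding.cpp hunks above show the API migration that repeats across the examples in this release: common_init_result now appears to hand out the model and context through owning handles (hence the .get() calls and the dropped llama_free/llama_free_model at shutdown), and per-model and vocabulary queries move to the llama_model_*/llama_vocab_* functions, with the vocab obtained via llama_model_get_vocab. A minimal sketch of the new call pattern, assuming the ownership change inferred from these hunks:

    // sketch only - assumes llama_init keeps ownership of the model/context,
    // so no explicit llama_free/llama_free_model is needed at shutdown
    common_init_result llama_init = common_init_from_params(params);

    llama_model   * model = llama_init.model.get();
    llama_context * ctx   = llama_init.context.get();

    const llama_vocab * vocab = llama_model_get_vocab(model);

    const int n_ctx_train = llama_model_n_ctx_train(model);   // was llama_n_ctx_train(model)
    const int n_embd      = llama_model_n_embd(model);        // was llama_n_embd(model)
    const llama_token sep = llama_vocab_sep(vocab);           // was llama_token_sep(model)

    // ... run the usual batch/decode loop on ctx ...

    llama_backend_free();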

package/src/llama.cpp/examples/eval-callback/eval-callback.cpp
@@ -127,7 +127,10 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
  }

  static bool run(llama_context * ctx, const common_params & params) {
- const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
+ const bool add_bos = llama_vocab_get_add_bos(vocab);

  std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);

@@ -162,8 +165,9 @@ int main(int argc, char ** argv) {
  // init
  common_init_result llama_init = common_init_from_params(params);

- llama_model * model = llama_init.model;
- llama_context * ctx = llama_init.context;
+ llama_model * model = llama_init.model.get();
+ llama_context * ctx = llama_init.context.get();
+
  if (model == nullptr || ctx == nullptr) {
  LOG_ERR("%s : failed to init\n", __func__);
  return 1;
@@ -184,9 +188,6 @@ int main(int argc, char ** argv) {
  LOG("\n");
  llama_perf_context_print(ctx);

- llama_free(ctx);
- llama_free_model(model);
-
  llama_backend_free();

  return 0;

package/src/llama.cpp/examples/export-lora/export-lora.cpp
@@ -1,12 +1,13 @@
- #include "arg.h"
- #include "common.h"
  #include "ggml.h"
  #include "ggml-alloc.h"
+ #include "gguf.h"
+
+ #include "arg.h"
+ #include "common.h"

  #include <map>
  #include <vector>
  #include <string>
- #include <thread>
  #include <fstream>

  static bool g_verbose = false;
@@ -128,7 +129,7 @@ struct lora_merge_ctx {

  lora_merge_ctx(
  std::string & base_fname,
- std::vector<common_lora_adapter_info> & lora_files,
+ std::vector<common_adapter_lora_info> & lora_files,
  std::string & outfile,
  int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
  fout.exceptions(std::ofstream::failbit); // fail fast on write errors
@@ -265,8 +266,8 @@ struct lora_merge_ctx {
  fout.write((const char *)data.data(), data.size());
  }

- printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
- printf("%s : wrote %ld tensors to output file\n", __func__, trans.size());
+ printf("%s : merged %zu tensors with lora adapters\n", __func__, n_merged);
+ printf("%s : wrote %zu tensors to output file\n", __func__, trans.size());
  }

  void copy_tensor(struct ggml_tensor * base) {
@@ -352,7 +353,7 @@ struct lora_merge_ctx {
  const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
  delta = ggml_scale(ctx0, delta, scale);
  cur = ggml_add(ctx0, delta, cur);
- printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
+ printf("%s : + merging from adapter[%zu] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
  printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
  }
  cur = ggml_cast(ctx0, cur, out->type);

package/src/llama.cpp/examples/gguf/gguf.cpp
@@ -1,10 +1,9 @@
  #include "ggml.h"
+ #include "gguf.h"

  #include <cstdio>
- #include <cinttypes>
  #include <string>
  #include <sstream>
- #include <fstream>
  #include <vector>

  #undef MIN
@@ -135,9 +134,10 @@ static bool gguf_ex_read_0(const std::string & fname) {

  for (int i = 0; i < n_tensors; ++i) {
  const char * name = gguf_get_tensor_name (ctx, i);
+ const size_t size = gguf_get_tensor_size (ctx, i);
  const size_t offset = gguf_get_tensor_offset(ctx, i);

- printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+ printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
  }
  }

@@ -182,9 +182,10 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {

  for (int i = 0; i < n_tensors; ++i) {
  const char * name = gguf_get_tensor_name (ctx, i);
+ const size_t size = gguf_get_tensor_size (ctx, i);
  const size_t offset = gguf_get_tensor_offset(ctx, i);

- printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+ printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
  }
  }

@@ -199,7 +200,8 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {

  struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);

- printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data);
+ printf("%s: tensor[%d]: n_dims = %d, ne = (%d, %d, %d, %d), name = %s, data = %p\n",
+ __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data);

  // print first 10 elements
  const float * data = (const float *) cur->data;
@@ -215,7 +217,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
  const float * data = (const float *) cur->data;
  for (int j = 0; j < ggml_nelements(cur); ++j) {
  if (data[j] != 100 + i) {
- fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
+ fprintf(stderr, "%s: tensor[%d], data[%d]: found %f, expected %f\n", __func__, i, j, data[j], float(100 + i));
  gguf_free(ctx);
  return false;
  }
@@ -245,6 +247,8 @@ int main(int argc, char ** argv) {
  check_data = false;
  }

+ srand(123456);
+
  const std::string fname(argv[1]);
  const std::string mode (argv[2]);


package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp
@@ -1,4 +1,5 @@
  #include "ggml.h"
+ #include "gguf.h"

  #include <cstdlib> /* abort() */
  #include <cstddef>

package/src/llama.cpp/examples/gguf-split/gguf-split.cpp
@@ -1,18 +1,19 @@
+ #include "ggml.h"
+ #include "gguf.h"
  #include "llama.h"
  #include "common.h"

  #include <algorithm>
- #include <cmath>
+ #include <cinttypes>
+ #include <climits>
+ #include <cstdio>
  #include <cstdlib>
+ #include <stdexcept>
+ #include <cstring>
  #include <fstream>
  #include <string>
  #include <vector>

- #include <stdio.h>
- #include <string.h>
- #include <climits>
- #include <stdexcept>
-
  #if defined(_WIN32)
  #include <windows.h>
  #ifndef PATH_MAX
@@ -297,7 +298,7 @@ struct split_strategy {
  total_size += ggml_nbytes(t);
  }
  total_size = total_size / 1000 / 1000; // convert to megabytes
- printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
+ printf("split %05d: n_tensors = %" PRIi64 ", total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
  i_split++;
  }
  }

package/src/llama.cpp/examples/gritlm/gritlm.cpp
@@ -11,6 +11,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
  std::vector<std::vector<float>> result;

  const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);

  llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);

@@ -19,16 +20,16 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve

  const std::string input_string = instruction + sentences[i];

- std::vector<llama_token> inputs = common_tokenize(model, input_string, true, false);
+ std::vector<llama_token> inputs = common_tokenize(vocab, input_string, true, false);

  const int32_t n_toks = inputs.size();

  // GritLM seems to have EOS = ""
  // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
- // inputs.push_back(llama_token_eos(model));
+ // inputs.push_back(llama_vocab_eos(vocab));

  // we want to ignore instruction tokens for mean pooling
- const int32_t n_inst = common_tokenize(model, instruction, true, false).size();
+ const int32_t n_inst = common_tokenize(vocab, instruction, true, false).size();

  #ifdef GRIT_DEBUG
  // debug tokens - should be matching as referenced in the GritLM sample
@@ -52,7 +53,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
  llama_decode(ctx, batch);

  // get embedding dimensions
- uint64_t n_embd = llama_n_embd(model);
+ uint64_t n_embd = llama_model_n_embd(model);

  // allocate embedding output
  std::vector<float> emb_unorm(n_embd, 0.0f);
@@ -97,7 +98,9 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
  std::string result;

  const llama_model * model = llama_get_model(ctx);
- llama_token eos_token = llama_token_eos(model);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
+ llama_token eos_token = llama_vocab_eos(vocab);

  llama_kv_cache_clear(ctx);
  llama_set_embeddings(ctx, false);
@@ -105,7 +108,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std

  llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);

- std::vector<llama_token> inputs = common_tokenize(model, prompt, false, true);
+ std::vector<llama_token> inputs = common_tokenize(vocab, prompt, false, true);
  int32_t i_current_token = 0;

  while (true) {
@@ -165,10 +168,10 @@ int main(int argc, char * argv[]) {

  llama_backend_init();

- llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
+ llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);

  // create generation context
- llama_context * ctx = llama_new_context_with_model(model, cparams);
+ llama_context * ctx = llama_init_from_model(model, cparams);

  auto sparams = llama_sampler_chain_default_params();

@@ -197,7 +200,7 @@ int main(int argc, char * argv[]) {
  const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
  const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));

- const int n_embd = llama_n_embd(model);
+ const int n_embd = llama_model_n_embd(model);

  const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
  const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
@@ -219,7 +222,7 @@ int main(int argc, char * argv[]) {

  llama_sampler_free(smpl);
  llama_free(ctx);
- llama_free_model(model);
+ llama_model_free(model);
  llama_backend_free();

  return 0;
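
Note: the gritlm.cpp hunks also cover the renamed model lifecycle entry points for code that manages the model directly rather than through common_init_from_params. A minimal sketch using only the new names that appear in this diff (parameter setup and error checks omitted):

    llama_backend_init();

    // was llama_load_model_from_file / llama_new_context_with_model
    llama_model   * model = llama_model_load_from_file(params.model.c_str(), mparams);
    llama_context * ctx   = llama_init_from_model(model, cparams);

    const llama_vocab * vocab = llama_model_get_vocab(model);
    std::vector<llama_token> inputs = common_tokenize(vocab, prompt, true, false);

    // ... encode / generate ...

    llama_free(ctx);
    llama_model_free(model);   // was llama_free_model
    llama_backend_free();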

package/src/llama.cpp/examples/imatrix/imatrix.cpp
@@ -7,7 +7,6 @@
  #include <cstdio>
  #include <cstring>
  #include <ctime>
- #include <sstream>
  #include <thread>
  #include <mutex>
  #include <vector>
@@ -40,7 +39,7 @@ public:
  void set_params(common_params params) { m_params = std::move(params); }
  bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
  void save_imatrix(int ncall = -1) const;
- bool load_imatrix(const char * file_name);
+ bool load_imatrix(const char * fname);
  private:
  std::unordered_map<std::string, Stats> m_stats;
  common_params m_params;
@@ -429,10 +428,14 @@ static void process_logits(
  }

  static bool compute_imatrix(llama_context * ctx, const common_params & params) {
- const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
- GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
+ const bool add_bos = llama_vocab_get_add_bos(vocab);
  const int n_ctx = llama_n_ctx(ctx);

+ GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
+
  auto tim1 = std::chrono::high_resolution_clock::now();
  LOG_INF("%s: tokenizing the input ..\n", __func__);

@@ -467,7 +470,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
  const int n_chunk_max = tokens.size() / n_ctx;

  const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
- const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+ const int n_vocab = llama_vocab_n_tokens(vocab);
  const int n_batch = params.n_batch;

  int count = 0;
@@ -507,7 +510,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {

  // add BOS token for the first batch of each chunk
  if (add_bos && j == 0) {
- tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
+ tokens[batch_start] = llama_vocab_bos(vocab);
  }

  common_batch_clear(batch);
@@ -618,14 +621,15 @@ int main(int argc, char ** argv) {
  // init
  common_init_result llama_init = common_init_from_params(params);

- llama_model * model = llama_init.model;
- llama_context * ctx = llama_init.context;
+ llama_model * model = llama_init.model.get();
+ llama_context * ctx = llama_init.context.get();
+
  if (model == nullptr || ctx == nullptr) {
  LOG_ERR("%s : failed to init\n", __func__);
  return 1;
  }

- const int n_ctx_train = llama_n_ctx_train(model);
+ const int n_ctx_train = llama_model_n_ctx_train(model);
  if (params.n_ctx > n_ctx_train) {
  LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
  __func__, n_ctx_train, params.n_ctx);
@@ -655,9 +659,6 @@ int main(int argc, char ** argv) {
  LOG("\n");
  llama_perf_context_print(ctx);

- llama_free(ctx);
- llama_free_model(model);
-
  llama_backend_free();

  return 0;

package/src/llama.cpp/examples/infill/infill.cpp
@@ -131,15 +131,17 @@ int main(int argc, char ** argv) {
  LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
  common_init_result llama_init = common_init_from_params(params);

- model = llama_init.model;
- ctx = llama_init.context;
+ model = llama_init.model.get();
+ ctx = llama_init.context.get();

  if (model == NULL) {
  LOG_ERR("%s: unable to load model\n", __func__);
  return 1;
  }

- const int n_ctx_train = llama_n_ctx_train(model);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
+ const int n_ctx_train = llama_model_n_ctx_train(model);
  const int n_ctx = llama_n_ctx(ctx);
  LOG_DBG("n_ctx: %d\n", n_ctx);

@@ -152,28 +154,28 @@ int main(int argc, char ** argv) {
  LOG_INF("\n");
  LOG_INF("%s\n", common_params_get_system_info(params).c_str());
  }
- const bool add_bos = llama_add_bos_token(model);
- GGML_ASSERT(!llama_add_eos_token(model));
+ const bool add_bos = llama_vocab_get_add_bos(vocab);
+ GGML_ASSERT(!llama_vocab_get_add_eos(vocab));

  std::vector<llama_token> embd_inp;
  std::vector<llama_token> embd_end;
  std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
  std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);

- GGML_ASSERT(llama_token_fim_pre(model) >= 0);
- GGML_ASSERT(llama_token_fim_suf(model) >= 0);
+ GGML_ASSERT(llama_vocab_fim_pre(vocab) >= 0);
+ GGML_ASSERT(llama_vocab_fim_suf(vocab) >= 0);

- inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
- inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));
+ inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
+ inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));

  embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
  embd_end = params.spm_infill ? inp_pfx : inp_sfx;
  if (add_bos) {
- embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+ embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
  }
  embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());

- const llama_token middle_token = llama_token_fim_mid(model);
+ const llama_token middle_token = llama_vocab_fim_mid(vocab);
  if (middle_token >= 0) {
  embd_inp.push_back(middle_token);
  }
@@ -185,7 +187,7 @@ int main(int argc, char ** argv) {

  // Should not run without any tokens
  if (embd_inp.empty()) {
- embd_inp.push_back(llama_token_bos(model));
+ embd_inp.push_back(llama_vocab_bos(vocab));
  LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
  }

@@ -420,10 +422,10 @@ int main(int argc, char ** argv) {
  // if not currently processing queued inputs;
  if ((int) embd_inp.size() <= n_consumed) {
  // deal with eot token in infill mode
- if ((common_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
+ if ((common_sampler_last(smpl) == llama_vocab_eot(vocab) || is_interacting) && params.interactive){
  if (is_interacting && !params.interactive_first) {
  // print an eot token
- LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
+ LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
  }
  LOG("\n");
  console::set_display(console::user_input);
@@ -463,13 +465,13 @@ int main(int argc, char ** argv) {
  std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
  std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);

- inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
- inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));
+ inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
+ inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));

  embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
  embd_end = params.spm_infill ? inp_pfx : inp_sfx;
  if (add_bos) {
- embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+ embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
  }
  embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());

@@ -484,7 +486,7 @@ int main(int argc, char ** argv) {
  is_interacting = false;
  }
  // deal with end of generation tokens in interactive mode
- else if (llama_token_is_eog(model, common_sampler_last(smpl))) {
+ else if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
  LOG_DBG("found EOS token\n");

  if (params.interactive) {
@@ -500,7 +502,7 @@ int main(int argc, char ** argv) {

  if (params.input_prefix_bos) {
  LOG_DBG("adding input prefix BOS token\n");
- embd_inp.push_back(llama_token_bos(model));
+ embd_inp.push_back(llama_vocab_bos(vocab));
  }

  std::string buffer;
@@ -563,7 +565,7 @@ int main(int argc, char ** argv) {
  }

  // end of generation
- if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) {
+ if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !params.interactive) {
  break;
  }

@@ -575,15 +577,12 @@ int main(int argc, char ** argv) {
  }
  }
  if (!params.interactive && n_remain <= 0) {
- LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
+ LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
  }

  LOG("\n");
  common_perf_print(ctx, smpl);

- llama_free(ctx);
- llama_free_model(model);
-
  common_sampler_free(smpl);
  llama_backend_free();


package/src/llama.cpp/examples/llama-bench/llama-bench.cpp
@@ -683,7 +683,7 @@ struct cmd_params_instance {
  bool cpu_strict;
  int poll;
  int n_gpu_layers;
- std::string rpc_servers;
+ std::string rpc_servers_str;
  llama_split_mode split_mode;
  int main_gpu;
  bool no_kv_offload;
@@ -696,8 +696,37 @@ struct cmd_params_instance {
  llama_model_params mparams = llama_model_default_params();

  mparams.n_gpu_layers = n_gpu_layers;
- if (!rpc_servers.empty()) {
- mparams.rpc_servers = rpc_servers.c_str();
+ if (!rpc_servers_str.empty()) {
+ auto rpc_servers = string_split<std::string>(rpc_servers_str, ',');
+
+ // add RPC devices
+ if (!rpc_servers.empty()) {
+ ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+ if (!rpc_reg) {
+ fprintf(stderr, "%s: failed to find RPC backend\n", __func__);
+ exit(1);
+ }
+
+ typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
+ ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+ if (!ggml_backend_rpc_add_device_fn) {
+ fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
+ exit(1);
+ }
+ static std::vector<ggml_backend_dev_t> devices;
+ devices.clear();
+ for (const std::string & server : rpc_servers) {
+ ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+ if (dev) {
+ devices.push_back(dev);
+ } else {
+ fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
+ exit(1);
+ }
+ }
+ devices.push_back(nullptr);
+ mparams.devices = devices.data();
+ }
  }
  mparams.split_mode = split_mode;
  mparams.main_gpu = main_gpu;
@@ -708,7 +737,7 @@ struct cmd_params_instance {
  }

  bool equal_mparams(const cmd_params_instance & other) const {
- return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers == other.rpc_servers &&
+ return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
  split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
  tensor_split == other.tensor_split;
  }
@@ -1401,7 +1430,8 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th
  llama_set_n_threads(ctx, n_threads, n_threads);

  const llama_model * model = llama_get_model(ctx);
- const int32_t n_vocab = llama_n_vocab(model);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+ const int32_t n_vocab = llama_vocab_n_tokens(vocab);

  std::vector<llama_token> tokens(n_batch);

@@ -1409,7 +1439,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th

  while (n_processed < n_prompt) {
  int n_tokens = std::min(n_prompt - n_processed, n_batch);
- tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
+ tokens[0] = n_processed == 0 && llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
  for (int i = 1; i < n_tokens; i++) {
  tokens[i] = std::rand() % n_vocab;
  }
@@ -1424,9 +1454,10 @@ static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
  llama_set_n_threads(ctx, n_threads, n_threads);

  const llama_model * model = llama_get_model(ctx);
- const int32_t n_vocab = llama_n_vocab(model);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+ const int32_t n_vocab = llama_vocab_n_tokens(vocab);

- llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
+ llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;

  for (int i = 0; i < n_gen; i++) {
  llama_decode(ctx, llama_batch_get_one(&token, 1));
@@ -1526,10 +1557,10 @@ int main(int argc, char ** argv) {
  // keep the same model between tests when possible
  if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
  if (lmodel) {
- llama_free_model(lmodel);
+ llama_model_free(lmodel);
  }

- lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
+ lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams());
  if (lmodel == NULL) {
  fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
  return 1;
@@ -1537,10 +1568,10 @@ int main(int argc, char ** argv) {
  prev_inst = &inst;
  }

- llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams());
+ llama_context * ctx = llama_init_from_model(lmodel, inst.to_llama_cparams());
  if (ctx == NULL) {
  fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
- llama_free_model(lmodel);
+ llama_model_free(lmodel);
  return 1;
  }

@@ -1626,7 +1657,7 @@ int main(int argc, char ** argv) {
  ggml_threadpool_free_fn(threadpool);
  }

- llama_free_model(lmodel);
+ llama_model_free(lmodel);

  if (p) {
  p->print_footer();
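
Note: the llama-bench change above replaces the old mparams.rpc_servers assignment with explicit device registration through the ggml backend registry, which avoids a hard link-time dependency on the RPC backend. A condensed sketch of that lookup, using only the registry calls shown in the hunk (the endpoint string is a placeholder and error handling is omitted):

    // resolve the RPC backend and its device-add entry point at runtime
    ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");

    typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
    auto add_rpc_device = (ggml_backend_rpc_add_device_t)
        ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");

    static std::vector<ggml_backend_dev_t> devices;         // must outlive the model
    devices.push_back(add_rpc_device("127.0.0.1:50052"));   // placeholder endpoint
    devices.push_back(nullptr);                             // null-terminated list

    llama_model_params mparams = llama_model_default_params();
    mparams.devices = devices.data();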