@fugood/llama.node 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190)
  1. package/CMakeLists.txt +2 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +1 -1
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +8 -8
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +8 -9
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +4 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +43 -9
  25. package/src/llama.cpp/.github/workflows/docker.yml +3 -0
  26. package/src/llama.cpp/CMakeLists.txt +7 -4
  27. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  28. package/src/llama.cpp/common/CMakeLists.txt +0 -2
  29. package/src/llama.cpp/common/arg.cpp +642 -607
  30. package/src/llama.cpp/common/arg.h +22 -22
  31. package/src/llama.cpp/common/common.cpp +79 -281
  32. package/src/llama.cpp/common/common.h +130 -100
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  34. package/src/llama.cpp/common/log.cpp +50 -50
  35. package/src/llama.cpp/common/log.h +18 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  37. package/src/llama.cpp/common/ngram-cache.h +19 -19
  38. package/src/llama.cpp/common/sampling.cpp +116 -108
  39. package/src/llama.cpp/common/sampling.h +20 -20
  40. package/src/llama.cpp/docs/build.md +37 -17
  41. package/src/llama.cpp/examples/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +14 -14
  43. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  47. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  48. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  49. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  50. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  51. package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
  52. package/src/llama.cpp/examples/infill/infill.cpp +40 -86
  53. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
  54. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  55. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  56. package/src/llama.cpp/examples/llava/clip.cpp +1 -0
  57. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  58. package/src/llama.cpp/examples/llava/llava.cpp +37 -3
  59. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  60. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  61. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  62. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  63. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
  64. package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
  65. package/src/llama.cpp/examples/main/main.cpp +64 -109
  66. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  67. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  68. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  69. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  70. package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
  71. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  72. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
  73. package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
  74. package/src/llama.cpp/examples/server/server.cpp +553 -691
  75. package/src/llama.cpp/examples/server/utils.hpp +312 -25
  76. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  77. package/src/llama.cpp/examples/simple/simple.cpp +128 -96
  78. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  79. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  80. package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
  81. package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
  82. package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
  83. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  84. package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
  85. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  86. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  87. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  88. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  89. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  90. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  91. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  92. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  93. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  94. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  95. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  96. package/src/llama.cpp/ggml/include/ggml.h +53 -393
  97. package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
  98. package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
  99. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  100. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
  101. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  102. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  103. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  104. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  105. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  106. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
  107. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  108. package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
  109. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  110. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
  111. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  112. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
  113. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  114. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  115. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  116. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
  117. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  118. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  120. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  121. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
  122. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  123. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  124. package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
  125. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  126. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
  127. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  128. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  129. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  130. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  131. package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
  132. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  133. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  134. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
  135. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  136. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  137. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
  138. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
  141. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  142. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  143. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
  144. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
  145. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
  146. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  148. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  149. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  150. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  151. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  152. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  153. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  154. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  155. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
  156. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
  157. package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
  158. package/src/llama.cpp/include/llama.h +67 -33
  159. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  160. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  161. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  162. package/src/llama.cpp/src/llama-sampling.cpp +745 -105
  163. package/src/llama.cpp/src/llama-sampling.h +21 -2
  164. package/src/llama.cpp/src/llama-vocab.cpp +49 -9
  165. package/src/llama.cpp/src/llama-vocab.h +35 -11
  166. package/src/llama.cpp/src/llama.cpp +2636 -2406
  167. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  168. package/src/llama.cpp/tests/CMakeLists.txt +1 -2
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
  171. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  172. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  173. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  174. package/src/llama.cpp/tests/test-log.cpp +2 -2
  175. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  176. package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
  177. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  178. package/src/llama.cpp/tests/test-rope.cpp +1 -0
  179. package/src/llama.cpp/tests/test-sampling.cpp +162 -137
  180. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  181. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  182. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  183. package/src/llama.cpp/common/train.cpp +0 -1515
  184. package/src/llama.cpp/common/train.h +0 -233
  185. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  186. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  187. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  188. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  189. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
  190. /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp

@@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
  }

  static void test_roundtrip_on_chunk(
- const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
+ const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
  float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
  ) {
  if (layer->type == GGML_TYPE_F16) {
@@ -156,7 +156,7 @@ static void test_roundtrip_on_chunk(
  if (use_reference) {
  qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
  } else {
- qfns.from_float(input_scratch, quantized_scratch, chunk_size);
+ qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size);
  }
  qfns.to_float(quantized_scratch, output_scratch, chunk_size);

@@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk(

  // Run quantization function for a single layer and update error stats
  static void test_roundtrip_on_layer(
- std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
+ std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
  const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
  std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
  ) {
@@ -187,13 +187,13 @@ static void test_roundtrip_on_layer(
  int num_chunks = (nelements + chunk_size - 1)/chunk_size;

  if (num_chunks < 2 || max_thread < 2) {
- test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
+ test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(),
  output_scratch.data(), print_layer_stats ? layer_error : total_error);
  } else {
  auto & stats = print_layer_stats ? layer_error : total_error;
  std::mutex mutex;
  uint64_t counter = 0;
- auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
+ auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr,
  &quantized_scratch, &output_scratch, chunk_size] () {
  error_stats local_stats {};
  while (true) {
@@ -205,7 +205,7 @@ static void test_roundtrip_on_layer(
  }
  lock.unlock();
  uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
- test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
+ test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset,
  quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
  }
  };
@@ -371,8 +371,9 @@ int main(int argc, char ** argv) {
  if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
  continue;
  }
- ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
- if (qfns.from_float && qfns.to_float) {
+ const auto * qfns = ggml_get_type_traits(type);
+ const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
+ if (qfns_cpu->from_float && qfns->to_float) {
  if (params.verbose) {
  printf("testing %s ...\n", ggml_type_name(type));
  }
@@ -393,7 +394,7 @@ int main(int argc, char ** argv) {
  test_roundtrip_on_layer(
  layer_name,
  params.per_layer_stats,
- qfns,
+ *qfns, *qfns_cpu,
  params.reference,
  kv_tensor.second,
  input_scratch,
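Note on the API change exercised above: ggml's type traits are now split between a generic table (ggml_get_type_traits, which still carries to_float and from_float_ref) and a CPU-backend table (ggml_get_type_traits_cpu, which carries the optimized from_float). The following is a minimal round-trip sketch using that split, assuming a build where ggml.h and ggml-cpu.h are both on the include path; the Q8_0 choice and element count are illustrative, not taken from the diff.

// Sketch: quantize a small float buffer and dequantize it back using the
// split traits API shown in the quantize-stats diff above.
#include "ggml.h"
#include "ggml-cpu.h"

#include <cstdio>
#include <vector>

int main() {
    const ggml_type type = GGML_TYPE_Q8_0;
    const int64_t n = 256; // must be a multiple of the type's block size

    const auto * traits     = ggml_get_type_traits(type);      // to_float, from_float_ref
    const auto * traits_cpu = ggml_get_type_traits_cpu(type);  // optimized from_float

    std::vector<float> src(n), dst(n);
    for (int64_t i = 0; i < n; i++) src[i] = 0.01f * i;

    std::vector<char> quantized(ggml_row_size(type, n)); // bytes needed for n quantized elements

    traits_cpu->from_float(src.data(), quantized.data(), n); // quantize (CPU backend traits)
    traits->to_float(quantized.data(), dst.data(), n);       // dequantize (generic traits)

    printf("%s round trip: src[10]=%.4f dst[10]=%.4f\n", ggml_type_name(type), src[10], dst[10]);
    return 0;
}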
package/src/llama.cpp/examples/retrieval/retrieval.cpp

@@ -77,7 +77,7 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
  static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
  size_t n_tokens = tokens.size();
  for (size_t i = 0; i < n_tokens; i++) {
- llama_batch_add(batch, tokens[i], i, { seq_id }, true);
+ common_batch_add(batch, tokens[i], i, { seq_id }, true);
  }
  }

@@ -107,18 +107,18 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
  }

  float * out = output + batch.seq_id[i][0] * n_embd;
- llama_embd_normalize(embd, out, n_embd);
+ common_embd_normalize(embd, out, n_embd);
  }
  }

  int main(int argc, char ** argv) {
- gpt_params params;
+ common_params params;

- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
  return 1;
  }

- gpt_init();
+ common_init();

  // For BERT models, batch size must be equal to ubatch size
  params.n_ubatch = params.n_batch;
@@ -149,7 +149,7 @@ int main(int argc, char ** argv) {
  llama_numa_init(params.numa);

  // load the model
- llama_init_result llama_init = llama_init_from_gpt_params(params);
+ common_init_result llama_init = common_init_from_params(params);

  llama_model * model = llama_init.model;
  llama_context * ctx = llama_init.context;
@@ -176,7 +176,7 @@ int main(int argc, char ** argv) {
  // print system information
  {
  LOG_INF("\n");
- LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("%s\n", common_params_get_system_info(params).c_str());
  }

  // max batch size
@@ -185,7 +185,7 @@ int main(int argc, char ** argv) {

  // tokenize the prompts and trim
  for (auto & chunk : chunks) {
- auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
+ auto inp = common_tokenize(ctx, chunk.textdata, true, false);
  if (inp.size() > n_batch) {
  LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
  __func__, (long long int) inp.size(), (long long int) n_batch);
@@ -204,7 +204,7 @@ int main(int argc, char ** argv) {
  LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
  LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
  for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
- LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
+ LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], common_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
  }
  LOG_INF("\n\n");
  }
@@ -232,7 +232,7 @@ int main(int argc, char ** argv) {
  if (batch.n_tokens + n_toks > n_batch) {
  float * out = emb + p * n_embd;
  batch_decode(ctx, batch, out, s, n_embd);
- llama_batch_clear(batch);
+ common_batch_clear(batch);
  p += s;
  s = 0;
  }
@@ -260,20 +260,20 @@ int main(int argc, char ** argv) {
  while (true) {
  LOG("Enter query: ");
  std::getline(std::cin, query);
- std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
+ std::vector<int32_t> query_tokens = common_tokenize(ctx, query, true);

  batch_add_seq(query_batch, query_tokens, 0);

  std::vector<float> query_emb(n_embd, 0);
  batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);

- llama_batch_clear(query_batch);
+ common_batch_clear(query_batch);

  // compute cosine similarities
  {
  std::vector<std::pair<int, float>> similarities;
  for (int i = 0; i < n_chunks; i++) {
- float sim = llama_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
+ float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
  similarities.push_back(std::make_pair(i, sim));
  }

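The retrieval changes above are one instance of the tree-wide rename of the common helpers from gpt_*/llama_* prefixes to a common_ prefix (common_params, common_params_parse, common_init_from_params, common_tokenize, common_token_to_piece, and so on). A minimal sketch of the new entry-point pattern follows; it assumes common.h and log.h from this llama.cpp revision and is not code taken from the diff itself.

// Sketch of the renamed "common" helpers that this diff migrates the examples to.
// gpt_params -> common_params, gpt_params_parse -> common_params_parse,
// gpt_init -> common_init, llama_init_from_gpt_params -> common_init_from_params,
// ::llama_tokenize -> common_tokenize, llama_token_to_piece -> common_token_to_piece.
#include "common.h"
#include "log.h"
#include "llama.h"

int main(int argc, char ** argv) {
    common_params params;
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }
    common_init();

    common_init_result llama_init = common_init_from_params(params);
    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;

    LOG_INF("%s\n", common_params_get_system_info(params).c_str());

    // tokenize the prompt with the renamed helper and print the pieces
    for (llama_token tok : common_tokenize(ctx, params.prompt, true)) {
        LOG_INF("%6d -> '%s'\n", tok, common_token_to_piece(ctx, tok).c_str());
    }

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}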
package/src/llama.cpp/examples/rpc/rpc-server.cpp

@@ -1,3 +1,5 @@
+ #include "ggml-cpu.h"
+
  #ifdef GGML_USE_CUDA
  #include "ggml-cuda.h"
  #endif
@@ -151,7 +153,7 @@ int main(int argc, char * argv[]) {
  get_backend_memory(&free_mem, &total_mem);
  }
  printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
- start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
+ ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem);
  ggml_backend_free(backend);
  return 0;
  }
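The RPC example now calls the public ggml-rpc entry point ggml_backend_rpc_start_server instead of the old start_rpc_server helper, and includes ggml-cpu.h explicitly for the CPU backend. Below is a minimal sketch of serving a CPU backend over RPC under those headers; the endpoint string and memory figures are placeholders, not values from the diff.

// Sketch: serve a plain CPU backend over ggml's RPC protocol via the renamed
// public entry point. Assumes the ggml-rpc backend is built in.
#include "ggml-cpu.h"
#include "ggml-rpc.h"
#include "ggml-backend.h"

#include <cstdio>

int main() {
    ggml_backend_t backend = ggml_backend_cpu_init();
    if (!backend) {
        fprintf(stderr, "failed to init CPU backend\n");
        return 1;
    }

    const char * endpoint  = "0.0.0.0:50052";           // placeholder endpoint
    const size_t free_mem  = 8ull * 1024 * 1024 * 1024; // placeholder: 8 GiB
    const size_t total_mem = free_mem;

    printf("Starting RPC server on %s\n", endpoint);
    ggml_backend_rpc_start_server(backend, endpoint, free_mem, total_mem); // blocks serving requests

    ggml_backend_free(backend);
    return 0;
}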
package/src/llama.cpp/examples/save-load-state/save-load-state.cpp

@@ -6,12 +6,12 @@
  #include <cstdio>

  int main(int argc, char ** argv) {
- gpt_params params;
+ common_params params;

  params.prompt = "The quick brown fox";
  params.sparams.seed = 1234;

- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
  return 1;
  }

@@ -28,7 +28,7 @@ int main(int argc, char ** argv) {
  std::string result2;

  // init
- llama_init_result llama_init = llama_init_from_gpt_params(params);
+ common_init_result llama_init = common_init_from_params(params);

  llama_model * model = llama_init.model;
  llama_context * ctx = llama_init.context;
@@ -42,15 +42,21 @@ int main(int argc, char ** argv) {

  llama_sampler * smpl = llama_sampler_chain_init(sparams);

- llama_sampler_chain_add(smpl, llama_sampler_init_softmax());
  llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));

  // tokenize prompt
- auto tokens = llama_tokenize(ctx, params.prompt, true);
+ auto tokens = common_tokenize(ctx, params.prompt, true);
+
+ // prepare the batch
+ llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
+ for (size_t i = 0; i < tokens.size(); i++) {
+ common_batch_add(batch, tokens[i], i, {0}, false);
+ }
+ batch.logits[batch.n_tokens - 1] = true; // generate next token

  // evaluate prompt
- llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));
- n_past += tokens.size();
+ llama_decode(ctx, batch);
+ n_past += batch.n_tokens;

  // save state (rng, logits, embedding and kv_cache) to file
  {
@@ -72,13 +78,17 @@ int main(int argc, char ** argv) {

  for (auto i = 0; i < params.n_predict; i++) {
  auto next_token = llama_sampler_sample(smpl, ctx, -1);
- auto next_token_str = llama_token_to_piece(ctx, next_token);
+ auto next_token_str = common_token_to_piece(ctx, next_token);

  printf("%s", next_token_str.c_str());
  result0 += next_token_str;

- if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
+ common_batch_clear(batch);
+ common_batch_add(batch, next_token, n_past, {0}, true);
+
+ if (llama_decode(ctx, batch)) {
  fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+ llama_batch_free(batch);
  llama_free(ctx);
  llama_free_model(model);
  return 1;
@@ -92,11 +102,10 @@ int main(int argc, char ** argv) {
  llama_free(ctx);

  // make new context
- auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+ auto * ctx2 = llama_new_context_with_model(model, common_context_params_to_llama(params));

  llama_sampler * smpl2 = llama_sampler_chain_init(sparams);

- llama_sampler_chain_add(smpl2, llama_sampler_init_softmax());
  llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed));

  printf("\nsecond run: %s", params.prompt.c_str());
@@ -128,13 +137,17 @@ int main(int argc, char ** argv) {
  // second run
  for (auto i = 0; i < params.n_predict; i++) {
  auto next_token = llama_sampler_sample(smpl2, ctx2, -1);
- auto next_token_str = llama_token_to_piece(ctx2, next_token);
+ auto next_token_str = common_token_to_piece(ctx2, next_token);

  printf("%s", next_token_str.c_str());
  result1 += next_token_str;

- if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0))) {
+ common_batch_clear(batch);
+ common_batch_add(batch, next_token, n_past, {0}, true);
+
+ if (llama_decode(ctx2, batch)) {
  fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+ llama_batch_free(batch);
  llama_free(ctx2);
  llama_free_model(model);
  return 1;
@@ -152,11 +165,10 @@ int main(int argc, char ** argv) {
  }

  // make new context
- auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+ auto * ctx3 = llama_new_context_with_model(model, common_context_params_to_llama(params));

  llama_sampler * smpl3 = llama_sampler_chain_init(sparams);

- llama_sampler_chain_add(smpl3, llama_sampler_init_softmax());
  llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed));

  printf("\nsingle seq run: %s", params.prompt.c_str());
@@ -216,13 +228,17 @@ int main(int argc, char ** argv) {
  // third run with seq 1 instead of 0
  for (auto i = 0; i < params.n_predict; i++) {
  auto next_token = llama_sampler_sample(smpl3, ctx3, -1);
- auto next_token_str = llama_token_to_piece(ctx3, next_token);
+ auto next_token_str = common_token_to_piece(ctx3, next_token);

  printf("%s", next_token_str.c_str());
  result2 += next_token_str;

- if (llama_decode(ctx3, llama_batch_get_one(&next_token, 1, n_past, 1))) {
+ common_batch_clear(batch);
+ common_batch_add(batch, next_token, n_past, {1}, true);
+
+ if (llama_decode(ctx3, batch)) {
  fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+ llama_batch_free(batch);
  llama_free(ctx3);
  llama_free_model(model);
  return 1;
@@ -236,6 +252,7 @@ int main(int argc, char ** argv) {
  llama_sampler_free(smpl2);
  llama_sampler_free(smpl3);

+ llama_batch_free(batch);
  llama_free(ctx3);
  llama_free_model(model);

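The recurring change in this file is the retirement of llama_batch_get_one (which took an explicit position and sequence id) in favor of a single reusable llama_batch: built once with llama_batch_init, refilled per token with common_batch_clear/common_batch_add, and released with llama_batch_free. A condensed sketch of that decode loop follows; it assumes a context, sampler, and tokenized prompt already exist as in the example above.

// Sketch of the new per-token decode pattern used above: one llama_batch is
// allocated up front, refilled for each sampled token, and freed at the end.
// This is a condensed illustration, not a drop-in replacement for the example.
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <vector>

void generate(llama_context * ctx, llama_sampler * smpl,
              const std::vector<llama_token> & tokens, int n_predict) {
    llama_batch batch = llama_batch_init(tokens.size(), 0, 1);

    // prompt: positions 0..n-1 on sequence 0, logits only for the last token
    for (size_t i = 0; i < tokens.size(); i++) {
        common_batch_add(batch, tokens[i], i, { 0 }, false);
    }
    batch.logits[batch.n_tokens - 1] = true;
    llama_decode(ctx, batch);
    int n_past = batch.n_tokens;

    // generation: one token per decode, reusing the same batch
    for (int i = 0; i < n_predict; i++) {
        llama_token next = llama_sampler_sample(smpl, ctx, -1);
        printf("%s", common_token_to_piece(ctx, next).c_str());

        common_batch_clear(batch);
        common_batch_add(batch, next, n_past, { 0 }, true); // request logits for the next sample
        if (llama_decode(ctx, batch)) {
            break; // decode failed
        }
        n_past++;
    }

    llama_batch_free(batch);
}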
package/src/llama.cpp/examples/server/CMakeLists.txt

@@ -15,22 +15,13 @@ set(TARGET_SRCS
  httplib.h
  )
  set(PUBLIC_ASSETS
- colorthemes.css
- style.css
- theme-beeninorder.css
- theme-ketivah.css
- theme-mangotango.css
- theme-playground.css
- theme-polarnight.css
- theme-snowstorm.css
  index.html
- index-new.html
- index.js
  completion.js
- system-prompts.js
- prompt-formats.js
- json-schema-to-grammar.mjs
  loading.html
+ deps_daisyui.min.css
+ deps_markdown-it.js
+ deps_tailwindcss.js
+ deps_vue.esm-browser.js
  )

  foreach(asset ${PUBLIC_ASSETS})