@fugood/llama.node 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275

package/src/llama.cpp/examples/quantize/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-quantize)
 add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)

package/src/llama.cpp/examples/quantize/quantize.cpp
@@ -26,6 +26,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
     { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
     { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
+    { "TQ1_0", LLAMA_FTYPE_MOSTLY_TQ1_0, " 1.69 bpw ternarization", },
+    { "TQ2_0", LLAMA_FTYPE_MOSTLY_TQ2_0, " 2.06 bpw ternarization", },
     { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
     { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
     { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
@@ -61,6 +63,16 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count";
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count";
 
+static bool striequals(const char * a, const char * b) {
+    while (*a && *b) {
+        if (std::tolower(*a) != std::tolower(*b)) {
+            return false;
+        }
+        a++; b++;
+    }
+    return *a == *b;
+}
+
 static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
     std::string ftype_str;
 
@@ -68,7 +80,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
         ftype_str.push_back(std::toupper(ch));
     }
     for (auto & it : QUANT_OPTIONS) {
-        if (it.name == ftype_str) {
+        if (striequals(it.name.c_str(), ftype_str.c_str())) {
             ftype = it.ftype;
             ftype_str_out = it.name;
             return true;
@@ -91,7 +103,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 }
 
 // usage:
-// ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
+// ./llama-quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 [[noreturn]]
 static void usage(const char * executable) {
@@ -104,7 +116,7 @@ static void usage(const char * executable) {
     printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
     printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
-    printf(" --keep-split: will generate quatized model in the same shards as input");
+    printf(" --keep-split: will generate quantized model in the same shards as input\n");
     printf(" --override-kv KEY=TYPE:VALUE\n");
     printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
     printf("Note: --include-weights and --exclude-weights cannot be used together\n");
@@ -223,15 +235,15 @@ static int prepare_imatrix(const std::string & imatrix_file,
 }
 
 static ggml_type parse_ggml_type(const char * arg) {
-    ggml_type result = GGML_TYPE_COUNT;
-    for (int j = 0; j < GGML_TYPE_COUNT; ++j) {
-        auto type = ggml_type(j);
+    for (int i = 0; i < GGML_TYPE_COUNT; ++i) {
+        auto type = (ggml_type)i;
         const auto * name = ggml_type_name(type);
-        if (name && strcmp(arg, name) == 0) {
-            result = type; break;
+        if (name && striequals(name, arg)) {
+            return type;
         }
     }
-    return result;
+    fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, arg);
+    return GGML_TYPE_COUNT;
 }
 
 int main(int argc, char ** argv) {
@@ -252,12 +264,18 @@ int main(int argc, char ** argv) {
         } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
             if (arg_idx < argc-1) {
                 params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
+                if (params.output_tensor_type == GGML_TYPE_COUNT) {
+                    usage(argv[0]);
+                }
             } else {
                 usage(argv[0]);
             }
         } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
             if (arg_idx < argc-1) {
                 params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
+                if (params.token_embedding_type == GGML_TYPE_COUNT) {
+                    usage(argv[0]);
+                }
             } else {
                 usage(argv[0]);
             }
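
The new striequals helper makes the quantization type argument and the --output-tensor-type / --token-embedding-type values case-insensitive, and parse_ggml_type now reports unknown type names so the callers can bail out through usage(). A small standalone sketch of the matching behaviour; the helper is copied verbatim from the hunk above, while the main() is illustrative only and not part of the package:

// striequals copied from the hunk above; main() only demonstrates the matching.
#include <cctype>
#include <cstdio>

static bool striequals(const char * a, const char * b) {
    while (*a && *b) {
        if (std::tolower(*a) != std::tolower(*b)) {
            return false;
        }
        a++; b++;
    }
    return *a == *b;
}

int main() {
    std::printf("%d\n", striequals("Q4_K_M", "q4_k_m")); // 1: case-insensitive match
    std::printf("%d\n", striequals("TQ1_0",  "tq1_0"));  // 1: the new ternary types match too
    std::printf("%d\n", striequals("Q4_K_M", "Q4_K"));   // 0: a prefix is not a match
    return 0;
}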

package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp
@@ -1,7 +1,7 @@
-#define LLAMA_API_INTERNAL
 #include "common.h"
 #include "ggml.h"
 #include "llama.h"
+#include "llama-impl.h"
 
 #include <algorithm>
 #include <cassert>
@@ -319,8 +319,7 @@ int main(int argc, char ** argv) {
         }
 
         auto cparams = llama_context_default_params();
-        cparams.n_ctx = 256;
-        cparams.seed = 1;
+        cparams.n_ctx = 256;
 
         ctx = llama_new_context_with_model(model, cparams);
 

package/src/llama.cpp/examples/retrieval/retrieval.cpp
@@ -1,15 +1,16 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <algorithm>
 #include <fstream>
+#include <iostream> // TODO: remove me
 
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
-    LOG_TEE("\n");
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
+    LOG("\n");
 }
 
 struct chunk {
@@ -18,7 +19,7 @@ struct chunk {
     // original file position
     size_t filepos;
     // original text data
-    std::string textdata = "";
+    std::string textdata;
     // tokenized text data
     std::vector<llama_token> tokens;
     // embedding
@@ -32,14 +33,14 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
     std::ifstream f(filename.c_str());
 
     if (!f.is_open()) {
-        fprintf(stderr, "Error: could not open file %s\n", filename.c_str());
+        LOG_ERR("could not open file %s\n", filename.c_str());
         return chunks;
     }
 
     chunk current_chunk;
     char buffer[1024];
     int64_t filepos = 0;
-    std::string current = "";
+    std::string current;
     while (f.read(buffer, 1024)) {
         current += std::string(buffer, f.gcount());
         size_t pos;
@@ -85,9 +86,9 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
     llama_kv_cache_clear(ctx);
 
     // run model
-    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
     if (llama_decode(ctx, batch) < 0) {
-        fprintf(stderr, "%s : failed to decode\n", __func__);
+        LOG_ERR("%s : failed to decode\n", __func__);
     }
 
     for (int i = 0; i < batch.n_tokens; i++) {
@@ -100,7 +101,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         if (embd == NULL) {
             embd = llama_get_embeddings_ith(ctx, i);
             if (embd == NULL) {
-                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
+                LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i);
                 continue;
             }
         }
@@ -113,29 +114,28 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
         return 1;
     }
 
+    gpt_init();
+
     // For BERT models, batch size must be equal to ubatch size
     params.n_ubatch = params.n_batch;
     params.embedding = true;
 
     if (params.chunk_size <= 0) {
-        fprintf(stderr, "chunk_size must be positive\n");
+        LOG_ERR("chunk_size must be positive\n");
         return 1;
     }
     if (params.context_files.empty()) {
-        fprintf(stderr, "context_files must be specified\n");
+        LOG_ERR("context_files must be specified\n");
         return 1;
     }
 
-    print_build_info();
-
-    printf("processing files:\n");
+    LOG_INF("processing files:\n");
     for (auto & context_file : params.context_files) {
-        printf("%s\n", context_file.c_str());
+        LOG_INF("%s\n", context_file.c_str());
     }
 
     std::vector<chunk> chunks;
@@ -143,18 +143,19 @@ int main(int argc, char ** argv) {
         std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
         chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
     }
-    printf("Number of chunks: %ld\n", chunks.size());
+    LOG_INF("Number of chunks: %ld\n", chunks.size());
 
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model;
-    llama_context * ctx;
-
     // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
+
     if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }
 
@@ -163,19 +164,19 @@ int main(int argc, char ** argv) {
 
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
     if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        LOG_ERR("%s: pooling type NONE not supported\n", __func__);
        return 1;
     }
 
     if (n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
     }
 
     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
     }
 
     // max batch size
@@ -186,7 +187,7 @@ int main(int argc, char ** argv) {
     for (auto & chunk : chunks) {
         auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
         if (inp.size() > n_batch) {
-            fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+            LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                 __func__, (long long int) inp.size(), (long long int) n_batch);
             return 1;
         }
@@ -200,12 +201,12 @@ int main(int argc, char ** argv) {
     // tokenization stats
     if (params.verbose_prompt) {
         for (int i = 0; i < (int) chunks.size(); i++) {
-            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
-            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
+            LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
+            LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
             for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
-                fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
+                LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
             }
-            fprintf(stderr, "\n\n");
+            LOG_INF("\n\n");
         }
     }
 
@@ -252,14 +253,15 @@ int main(int argc, char ** argv) {
         chunks[i].tokens.clear();
     }
 
+    struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
+
     // start loop, receive query and return top k similar chunks based on cosine similarity
     std::string query;
     while (true) {
-        printf("Enter query: ");
+        LOG("Enter query: ");
         std::getline(std::cin, query);
         std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
 
-        struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
         batch_add_seq(query_batch, query_tokens, 0);
 
         std::vector<float> query_emb(n_embd, 0);
@@ -280,19 +282,22 @@ int main(int argc, char ** argv) {
                 return a.second > b.second;
             });
 
-            printf("Top %d similar chunks:\n", params.sparams.top_k);
+            LOG("Top %d similar chunks:\n", params.sparams.top_k);
             for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
-                printf("filename: %s\n", chunks[similarities[i].first].filename.c_str());
-                printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
-                printf("similarity: %f\n", similarities[i].second);
-                printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
-                printf("--------------------\n");
+                LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
+                LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
+                LOG("similarity: %f\n", similarities[i].second);
+                LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
+                LOG("--------------------\n");
            }
        }
    }
 
+    LOG("\n");
+    llama_perf_context_print(ctx);
+
     // clean up
-    llama_print_timings(ctx);
+    llama_batch_free(query_batch);
     llama_free(ctx);
     llama_free_model(model);
     llama_backend_free();
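
Most of the churn in retrieval.cpp is the same mechanical migration applied across the examples in this update: gpt_params_parse now takes an example id plus an optional usage callback (from the new common/arg.h), llama_init_from_gpt_params returns a llama_init_result struct instead of being unpacked with std::tie, and raw fprintf/printf output goes through the new common logger. A condensed sketch of that shared pattern, using only calls that appear in the hunks of this diff and assuming the llama.cpp "common" headers shipped in the package (error handling and the example-specific work trimmed):

// Condensed sketch of the init pattern the examples now share.
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }
    gpt_init(); // sets up the common logger behind LOG / LOG_INF / LOG_WRN / LOG_ERR

    llama_backend_init();
    llama_numa_init(params.numa);

    llama_init_result llama_init = llama_init_from_gpt_params(params);
    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;
    if (model == NULL || ctx == NULL) {
        LOG_ERR("%s: unable to load model\n", __func__);
        return 1;
    }

    // ... tokenize / decode / embed as before ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}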

package/src/llama.cpp/examples/rpc/rpc-server.cpp
@@ -6,6 +6,10 @@
 #include "ggml-metal.h"
 #endif
 
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
 #include "ggml-rpc.h"
 #ifdef _WIN32
 # include <windows.h>
@@ -16,7 +20,7 @@
 #include <stdio.h>
 
 struct rpc_server_params {
-    std::string host = "0.0.0.0";
+    std::string host = "127.0.0.1";
     int port = 50052;
     size_t backend_mem = 0;
 };
@@ -79,6 +83,12 @@ static ggml_backend_t create_backend() {
     if (!backend) {
         fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
     }
+#elif GGML_USE_VULKAN
+    fprintf(stderr, "%s: using Vulkan backend\n", __func__);
+    backend = ggml_backend_vk_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
+    }
 #endif
 
     // if there aren't GPU Backends fallback to CPU backend
@@ -92,6 +102,8 @@ static ggml_backend_t create_backend() {
 static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
 #ifdef GGML_USE_CUDA
     ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
+#elif GGML_USE_VULKAN
+    ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
 #else
 #ifdef _WIN32
     MEMORYSTATUSEX status;
@@ -114,6 +126,17 @@ int main(int argc, char * argv[]) {
         fprintf(stderr, "Invalid parameters\n");
         return 1;
     }
+
+    if (params.host != "127.0.0.1") {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+        fprintf(stderr, "WARNING: Host ('%s') is != '127.0.0.1'\n", params.host.c_str());
+        fprintf(stderr, " Never expose the RPC server to an open network!\n");
+        fprintf(stderr, " This is an experimental feature and is not secure!\n");
+        fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+        fprintf(stderr, "\n");
+    }
+
     ggml_backend_t backend = create_backend();
     if (!backend) {
         fprintf(stderr, "Failed to create backend\n");
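
Beyond defaulting the listen address to 127.0.0.1 and warning when it is overridden, the RPC server can now sit on top of the Vulkan backend. A minimal sketch of the selection order shown above, assuming ggml was built with GGML_USE_VULKAN; pick_backend is a hypothetical stand-in for the server's create_backend:

// Sketch of the backend preference: try Vulkan device 0, fall back to CPU.
#include "ggml-backend.h"
#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif

static ggml_backend_t pick_backend() {
    ggml_backend_t backend = nullptr;
#ifdef GGML_USE_VULKAN
    backend = ggml_backend_vk_init(0); // device 0, as in the hunk above
#endif
    if (!backend) {
        backend = ggml_backend_cpu_init(); // CPU fallback
    }
    return backend;
}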

package/src/llama.cpp/examples/save-load-state/save-load-state.cpp
@@ -1,17 +1,17 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 
 #include <vector>
 #include <cstdio>
-#include <chrono>
 
 int main(int argc, char ** argv) {
     gpt_params params;
 
     params.prompt = "The quick brown fox";
+    params.sparams.seed = 1234;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
@@ -28,15 +28,23 @@ int main(int argc, char ** argv) {
     std::string result2;
 
     // init
-    llama_model * model;
-    llama_context * ctx;
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
 
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
     if (model == nullptr || ctx == nullptr) {
         fprintf(stderr, "%s : failed to init\n", __func__);
         return 1;
     }
 
+    auto sparams = llama_sampler_chain_default_params();
+
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl, llama_sampler_init_softmax());
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));
+
     // tokenize prompt
     auto tokens = llama_tokenize(ctx, params.prompt, true);
 
@@ -63,16 +71,7 @@ int main(int argc, char ** argv) {
     printf("\nfirst run: %s", params.prompt.c_str());
 
     for (auto i = 0; i < params.n_predict; i++) {
-        auto * logits = llama_get_logits(ctx);
-        auto n_vocab = llama_n_vocab(model);
-
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-        }
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-        auto next_token = llama_sample_token(ctx, &candidates_p);
+        auto next_token = llama_sampler_sample(smpl, ctx, -1);
         auto next_token_str = llama_token_to_piece(ctx, next_token);
 
         printf("%s", next_token_str.c_str());
@@ -95,6 +94,11 @@ int main(int argc, char ** argv) {
     // make new context
     auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
 
+    llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl2, llama_sampler_init_softmax());
+    llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed));
+
     printf("\nsecond run: %s", params.prompt.c_str());
 
     // load state (rng, logits, embedding and kv_cache) from file
@@ -123,15 +127,7 @@ int main(int argc, char ** argv) {
 
     // second run
     for (auto i = 0; i < params.n_predict; i++) {
-        auto * logits = llama_get_logits(ctx2);
-        auto n_vocab = llama_n_vocab(model);
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-        }
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-        auto next_token = llama_sample_token(ctx2, &candidates_p);
+        auto next_token = llama_sampler_sample(smpl2, ctx2, -1);
         auto next_token_str = llama_token_to_piece(ctx2, next_token);
 
         printf("%s", next_token_str.c_str());
@@ -156,7 +152,12 @@ int main(int argc, char ** argv) {
     }
 
     // make new context
-    auto* ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+    auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+
+    llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl3, llama_sampler_init_softmax());
+    llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed));
 
     printf("\nsingle seq run: %s", params.prompt.c_str());
 
@@ -214,15 +215,7 @@ int main(int argc, char ** argv) {
 
     // third run with seq 1 instead of 0
     for (auto i = 0; i < params.n_predict; i++) {
-        auto * logits = llama_get_logits(ctx3);
-        auto n_vocab = llama_n_vocab(model);
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-        }
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-        auto next_token = llama_sample_token(ctx3, &candidates_p);
+        auto next_token = llama_sampler_sample(smpl3, ctx3, -1);
         auto next_token_str = llama_token_to_piece(ctx3, next_token);
 
         printf("%s", next_token_str.c_str());
@@ -239,6 +232,10 @@ int main(int argc, char ** argv) {
 
     printf("\n");
 
+    llama_sampler_free(smpl);
+    llama_sampler_free(smpl2);
+    llama_sampler_free(smpl3);
+
     llama_free(ctx3);
     llama_free_model(model);
 

package/src/llama.cpp/examples/server/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-server)
-option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
-option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
+
+option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
 
 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
 
@@ -30,6 +30,7 @@ set(PUBLIC_ASSETS
     system-prompts.js
     prompt-formats.js
     json-schema-to-grammar.mjs
+    loading.html
 )
 
 foreach(asset ${PUBLIC_ASSETS})
@@ -45,9 +46,6 @@ endforeach()
 
 add_executable(${TARGET} ${TARGET_SRCS})
 install(TARGETS ${TARGET} RUNTIME)
-target_compile_definitions(${TARGET} PRIVATE
-    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
-)
 
 target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})