@fugood/llama.node 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
package/src/llama.cpp/examples/speculative/speculative.cpp
@@ -1,11 +1,16 @@
+ #include "arg.h"
  #include "common.h"
+ #include "sampling.h"
+ #include "log.h"
  #include "llama.h"

- #include <cmath>
+ #include <algorithm>
  #include <cstdio>
+ #include <cstring>
+ #include <random>
+ #include <set>
  #include <string>
  #include <vector>
- #include <set>

  #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
  #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@@ -21,19 +26,23 @@ struct seq_draft {
  std::vector<llama_token> tokens;
  std::vector<std::vector<llama_token_data>> dists;

- struct llama_sampling_context * ctx_sampling;
+ struct gpt_sampler * smpl = nullptr;
  };

  int main(int argc, char ** argv) {
  gpt_params params;

- if (!gpt_params_parse(argc, argv, params)) {
- gpt_params_print_usage(argc, argv, params);
+ // needed to get candidate probs even for temp <= 0.0
+ params.sparams.n_probs = 128;
+
+ if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
  return 1;
  }

+ gpt_init();
+
  if (params.model_draft.empty()) {
- fprintf(stderr, "%s: error: --model-draft is required\n", __func__);
+ LOG_ERR("%s: --model-draft is required\n", __func__);
  return 1;
  }

@@ -43,18 +52,9 @@ int main(int argc, char ** argv) {
  // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
  const float p_split = params.p_split;

- if (params.seed == LLAMA_DEFAULT_SEED) {
- params.seed = time(NULL);
- }
- std::default_random_engine rng(params.seed);
+ std::default_random_engine rng(params.sparams.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sparams.seed);
  std::uniform_real_distribution<> u_dist;

- #ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("speculative", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
- #endif // LOG_DISABLE_LOGS
-
  // init llama.cpp
  llama_backend_init();
  llama_numa_init(params.numa);
@@ -66,26 +66,31 @@ int main(int argc, char ** argv) {
  llama_context * ctx_dft = NULL;

  // load the target model
- std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);
+ llama_init_result llama_init_tgt = llama_init_from_gpt_params(params);
+ model_tgt = llama_init_tgt.model;
+ ctx_tgt = llama_init_tgt.context;

  // load the draft model
  params.model = params.model_draft;
  params.n_gpu_layers = params.n_gpu_layers_draft;
- if (params.n_threads_draft > 0) {
- params.n_threads = params.n_threads_draft;
+ if (params.draft_cpuparams.n_threads > 0) {
+ params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
  }
- params.n_threads_batch = params.n_threads_batch_draft;
- std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
+
+ params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
+ llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
+ model_dft = llama_init_dft.model;
+ ctx_dft = llama_init_dft.context;

  const bool vocab_type_tgt = llama_vocab_type(model_tgt);
- LOG("vocab_type tgt: %d\n", vocab_type_tgt);
+ LOG_DBG("vocab_type tgt: %d\n", vocab_type_tgt);

  const bool vocab_type_dft = llama_vocab_type(model_dft);
- LOG("vocab_type dft: %d\n", vocab_type_dft);
+ LOG_DBG("vocab_type dft: %d\n", vocab_type_dft);

  if (vocab_type_tgt != vocab_type_dft) {
- fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__);
- fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
+ LOG_ERR("%s: draft model vocab type must match target model to use speculation but ", __func__);
+ LOG_ERR("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
  return 1;
  }

@@ -95,7 +100,7 @@ int main(int argc, char ** argv) {
  llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
  llama_token_eos(model_tgt) != llama_token_eos(model_dft)
  ) {
- fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__);
+ LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
  return 1;
  }

@@ -107,8 +112,8 @@ int main(int argc, char ** argv) {
  : n_vocab_dft - n_vocab_tgt;

  if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
- fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
- fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
+ LOG_ERR("%s: draft model vocab must closely match target model to use speculation but ", __func__);
+ LOG_ERR("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
  n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
  return 1;
  }
@@ -117,8 +122,8 @@ int main(int argc, char ** argv) {
  const char * token_text_tgt = llama_token_get_text(model_tgt, i);
  const char * token_text_dft = llama_token_get_text(model_dft, i);
  if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
- fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
- fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
+ LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
+ LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
  llama_token_to_piece(ctx_tgt, i).c_str(),
  llama_token_to_piece(ctx_dft, i).c_str());
  return 1;
@@ -135,18 +140,16 @@ int main(int argc, char ** argv) {
  const int max_tokens_list_size = max_context_size - 4;

  if ((int) inp.size() > max_tokens_list_size) {
- fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+ LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
  return 1;
  }

- fprintf(stderr, "\n\n");
+ LOG("\n\n");

  for (auto id : inp) {
- fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str());
+ LOG("%s", llama_token_to_piece(ctx_tgt, id).c_str());
  }

- fflush(stderr);
-
  const int n_input = inp.size();

  const auto t_enc_start = ggml_time_us();
@@ -174,19 +177,17 @@ int main(int argc, char ** argv) {
  // used to determine end of generation
  bool has_eos = false;

- // target model sampling context
- struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+ // target model sampling context (reuse the llama_context's sampling instance)
+ struct gpt_sampler * smpl = gpt_sampler_init(model_tgt, params.sparams);
+
+ struct llama_sampler * softmax = llama_sampler_init_softmax();

  // draft sequence data
  std::vector<seq_draft> drafts(n_seq_dft);

- params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
- if (params.sparams.temp == 0) {
- params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
- }
-
  for (int s = 0; s < n_seq_dft; ++s) {
- drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
+ // allocate gpt_sampler for each draft sequence
+ drafts[s].smpl = gpt_sampler_init(model_dft, params.sparams);
  }

  llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
@@ -210,7 +211,7 @@ int main(int argc, char ** argv) {
  active_seqs.insert(s);
  const auto & tokens = drafts[s].tokens;

- LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
+ LOG_DBG("draft %d: %s\n", s, string_from(ctx_dft, tokens).c_str());
  }

  int i_dft = 0;
@@ -228,12 +229,12 @@ int main(int argc, char ** argv) {
  bool accept = false;
  if (params.sparams.temp > 0) {
  // stochastic verification
+ gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);

- llama_token_data_array dist_tgt = llama_sampling_prepare(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft], true, NULL);
- llama_sample_softmax(ctx_tgt, &dist_tgt);
- float p_tgt = 0, p_dft = 0;
+ auto & dist_tgt = *gpt_sampler_get_candidates(smpl);

- // GGML_ASSERT(dist_tgt.size() == dist_dft.size());
+ float p_tgt = 0.0f;
+ float p_dft = 0.0f;

  while (active_seqs.size() > 0) {
  // randomly select a sequence to verify from active sequences
@@ -252,9 +253,13 @@ int main(int argc, char ** argv) {
  }
  continue;
  }
- LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
+
+ LOG_DBG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
  float r = u_dist(rng);
- llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), true };
+ llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };
+
+ //GGML_ASSERT(dist_tgt.size <= dist_dft.size);
+
  // acquire the token probabilities assigned by the draft and target models
  for (size_t i = 0; i < dist_tgt.size; i++) {
  if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
@@ -267,24 +272,23 @@ int main(int argc, char ** argv) {
  break;
  }
  }
- LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
+ LOG_DBG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
  if (r <= p_tgt / p_dft) {
  s_keep = s;
  accept = true;
  token_id = drafts[s].tokens[i_dft];
  token_str = llama_token_to_piece(ctx_tgt, token_id);
- llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
+ gpt_sampler_accept(smpl, token_id, true);

- LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
+ LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
  break;
  } else {
- LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
+ LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
  drafts[s].active = false;

  // calculate residual probability
  GGML_ASSERT(dist_tgt.sorted);
  GGML_ASSERT(dist_dft.sorted);
- float sum_probs = 0.0f;

  // sort dist by id
  std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
@@ -294,10 +298,18 @@ int main(int argc, char ** argv) {
  return a.id < b.id;
  });

+ float sum_probs = 0.0f;
+
  for (size_t i = 0; i < dist_tgt.size; i++) {
- dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p);
+ if (i < dist_dft.size) {
+ dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p);
+ } else {
+ dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p);
+ }
+
  sum_probs += dist_tgt.data[i].p;
  }
+
  for (size_t i = 0; i < dist_tgt.size; i++) {
  dist_tgt.data[i].p /= sum_probs;
  }
@@ -326,22 +338,28 @@ int main(int argc, char ** argv) {
  if (!accept) {
  // all drafted tokens were rejected
  // sample from the target model
- LOG("all drafted tokens were rejected, sampling from residual distribution\n");
- token_id = llama_sample_token(ctx_tgt, &dist_tgt);
- llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
+ LOG_DBG("all drafted tokens were rejected, sampling from residual distribution\n");
+ std::vector<float> probs(dist_tgt.size);
+ for (size_t i = 0; i < dist_tgt.size; ++i) {
+ probs[i] = dist_tgt.data[i].p;
+ }
+
+ std::discrete_distribution<> dist(probs.begin(), probs.end());
+
+ const int idx = dist(rng);
+
+ token_id = dist_tgt.data[idx].id;
+ gpt_sampler_accept(smpl, token_id, true);
  token_str = llama_token_to_piece(ctx_tgt, token_id);
  }
-
  } else {
  // greedy verification

  // sample from the target model
- LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
- token_id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
+ LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
+ token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);

- llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
-
- //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
+ gpt_sampler_accept(smpl, token_id, true);

  token_str = llama_token_to_piece(ctx_tgt, token_id);

@@ -351,7 +369,7 @@ int main(int argc, char ** argv) {
  }

  if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) {
- LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
+ LOG_DBG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());

  s_keep = s;
  accept = true;
@@ -373,26 +391,24 @@ int main(int argc, char ** argv) {
  ++i_dft;
  if (params.use_color) {
  // Color token according to its origin sequence
- printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
+ LOG("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
  } else {
- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
  }
- fflush(stdout);
  continue;
  } else {
- printf("%s", token_str.c_str());
- fflush(stdout);
+ LOG("%s", token_str.c_str());
  break;
  }
  }
  }

  {
- LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
+ LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());

  // TODO: simplify
  {
- LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
+ LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);

  llama_kv_cache_seq_keep(ctx_dft, s_keep);
  llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1);
@@ -419,7 +435,7 @@ int main(int argc, char ** argv) {
  llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);

  llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
- // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
+ // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
  llama_decode(ctx_dft, batch_dft);

  ++n_past_dft;
@@ -429,7 +445,10 @@ int main(int argc, char ** argv) {
  break;
  }

- llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling);
+ if (drafts[0].smpl) {
+ gpt_sampler_free(drafts[0].smpl);
+ }
+ drafts[0].smpl = gpt_sampler_clone(smpl);

  int n_seq_cur = 1;
  int n_past_cur = n_past_dft;
@@ -458,21 +477,21 @@ int main(int argc, char ** argv) {
  continue;
  }

- llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft);
+ gpt_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);

- const auto & cur_p = drafts[s].ctx_sampling->cur;
+ const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl);

- for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) {
- LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
- k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str());
+ for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
+ LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
+ k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
  }

  std::vector<int> sa(1, s);

  // attempt to split the branch if the probability is high enough
  for (int f = 1; f < 8; ++f) {
- if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) {
- LOG("splitting seq %3d into %3d\n", s, n_seq_cur);
+ if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) {
+ LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);

  llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
  llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
@@ -498,7 +517,10 @@ int main(int argc, char ** argv) {
  drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
  drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;

- llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling);
+ if (drafts[n_seq_cur].smpl) {
+ gpt_sampler_free(drafts[n_seq_cur].smpl);
+ }
+ drafts[n_seq_cur].smpl = gpt_sampler_clone(drafts[s].smpl);

  sa.push_back(n_seq_cur);

@@ -510,15 +532,15 @@ int main(int argc, char ** argv) {

  // add drafted token for each sequence
  for (int is = 0; is < (int) sa.size(); ++is) {
- const llama_token id = cur_p[is].id;
+ const llama_token id = cur_p->data[is].id;

  const int s = sa[is];

- llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);
+ gpt_sampler_accept(drafts[s].smpl, id, true);

  drafts[s].tokens.push_back(id);
  // save cur_p.data into drafts[s].dists
- drafts[s].dists.push_back(cur_p);
+ drafts[s].dists.push_back({cur_p->data, cur_p->data + cur_p->size});

  // add unique drafted tokens to the target batch
  drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
@@ -558,7 +580,7 @@ int main(int argc, char ** argv) {
  llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
  }

- // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
+ // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
  llama_decode(ctx_tgt, batch_tgt);
  ++n_past_tgt;
  }
@@ -576,29 +598,33 @@ int main(int argc, char ** argv) {

  auto t_dec_end = ggml_time_us();

- LOG_TEE("\n\n");
+ LOG("\n\n");

- LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
- LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
+ LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+ LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));

- LOG_TEE("\n");
- LOG_TEE("n_draft = %d\n", n_draft);
- LOG_TEE("n_predict = %d\n", n_predict);
- LOG_TEE("n_drafted = %d\n", n_drafted);
- LOG_TEE("n_accept = %d\n", n_accept);
- LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
+ LOG_INF("\n");
+ LOG_INF("n_draft = %d\n", n_draft);
+ LOG_INF("n_predict = %d\n", n_predict);
+ LOG_INF("n_drafted = %d\n", n_drafted);
+ LOG_INF("n_accept = %d\n", n_accept);
+ LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);

- LOG_TEE("\ndraft:\n");
- llama_print_timings(ctx_dft);
+ LOG_INF("\n");
+ LOG_INF("draft:\n\n");
+ // TODO: print sampling/grammar timings for all drafts
+ llama_perf_context_print(ctx_dft);

- LOG_TEE("\ntarget:\n");
- llama_print_timings(ctx_tgt);
+ LOG_INF("\n");
+ LOG_INF("target:\n\n");
+ gpt_perf_print(ctx_tgt, smpl);

- llama_sampling_free(ctx_sampling);
+ gpt_sampler_free(smpl);
  for (int s = 0; s < n_seq_dft; ++s) {
- llama_sampling_free(drafts[s].ctx_sampling);
+ gpt_sampler_free(drafts[s].smpl);
  }

+ llama_sampler_free(softmax);
  llama_batch_free(batch_dft);

  llama_free(ctx_tgt);
@@ -609,7 +635,7 @@ int main(int argc, char ** argv) {

  llama_backend_free();

- fprintf(stderr, "\n\n");
+ LOG("\n\n");

  return 0;
  }
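The speculative.cpp hunks above track the vendored llama.cpp's switch from the old llama_sampling_* helpers to the gpt_sampler API in common/sampling.h. The following is only a minimal sketch of that sampler lifecycle (parse, init, decode, sample, accept, free) using calls that appear in the hunks; it is not code taken from the package. The LLAMA_EXAMPLE_COMMON enum value, the ::llama_tokenize() common helper, and the llama_free_model() call are assumptions about the llama.cpp API of this period rather than lines from this diff.

#include "arg.h"
#include "common.h"
#include "log.h"
#include "sampling.h"
#include "llama.h"

#include <vector>

int main(int argc, char ** argv) {
    gpt_params params;

    // LLAMA_EXAMPLE_COMMON is assumed here; the speculative example uses LLAMA_EXAMPLE_SPECULATIVE
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

    gpt_init();

    llama_backend_init();
    llama_numa_init(params.numa);

    // model and context now come back together in a llama_init_result
    llama_init_result llama_init = llama_init_from_gpt_params(params);
    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;

    // a single gpt_sampler replaces the old llama_sampling_context
    gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);

    // tokenize and decode the prompt so logits exist for the last position
    std::vector<llama_token> inp = ::llama_tokenize(ctx, params.prompt, true);

    llama_batch batch = llama_batch_init(params.n_ctx, 0, 1);
    for (size_t i = 0; i < inp.size(); ++i) {
        llama_batch_add(batch, inp[i], (llama_pos) i, { 0 }, i == inp.size() - 1);
    }
    llama_decode(ctx, batch);

    // sample one token from the last logits, then report it back to the sampler
    // so stateful pieces (penalties, grammar) stay in sync
    const llama_token id = gpt_sampler_sample(smpl, ctx, batch.n_tokens - 1);
    gpt_sampler_accept(smpl, id, /* accept_grammar = */ true);

    LOG("%s\n", llama_token_to_piece(ctx, id).c_str());

    gpt_sampler_free(smpl);
    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model); // assumed cleanup call, not shown in this diff
    llama_backend_free();

    return 0;
}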
package/src/llama.cpp/examples/sycl/run-llama2.sh
@@ -4,33 +4,24 @@
  # Copyright (C) 2024 Intel Corporation
  # SPDX-License-Identifier: MIT

- INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
  source /opt/intel/oneapi/setvars.sh

- if [ $# -gt 0 ]; then
- GGML_SYCL_DEVICE=$1
- GGML_SYCL_SINGLE_GPU=1
- else
- GGML_SYCL_DEVICE=0
- GGML_SYCL_SINGLE_GPU=0
- fi
-
  #export GGML_SYCL_DEBUG=1

-
  #ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.

- if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
+ INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
+ MODEL_FILE=models/llama-2-7b.Q4_0.gguf
+ NGL=33
+ CONEXT=8192
+
+ if [ $# -gt 0 ]; then
+ GGML_SYCL_DEVICE=$1
  echo "use $GGML_SYCL_DEVICE as main GPU"
  #use signle GPU only
- ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+ ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT} -mg $GGML_SYCL_DEVICE -sm none
+
  else
  #use multiple GPUs with same max compute units
- ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+ ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT}
  fi
-
- #use main GPU only
- #ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
-
- #use multiple GPUs with same max compute units
- #ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
package/src/llama.cpp/examples/sycl/win-run-llama2.bat
@@ -6,4 +6,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
  @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force


- .\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
+ .\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
package/src/llama.cpp/examples/tokenize/tokenize.cpp
@@ -1,11 +1,13 @@
  #include "common.h"
+ //#include "log.h" // TODO: start using log.h
  #include "llama.h"

- #include <cmath>
  #include <cstdio>
+ #include <cstring>
  #include <fstream>
  #include <string>
  #include <vector>
+ #include <iostream> // TODO: remove me

  #if defined(_WIN32)
  #define WIN32_LEAN_AND_MEAN
@@ -13,25 +15,25 @@
  #include <shellapi.h> // For CommandLineToArgvW
  #endif

- static void print_usage_information(const char * argv0, FILE * stream) {
- fprintf(stream, "usage: %s [options]\n\n", argv0);
- fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n");
- fprintf(stream, "and prints the resulting tokens to standard output.\n\n");
- fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n");
- fprintf(stream, "to control the behavior of the tokenizer.\n\n");
- fprintf(stream, " The possible options are:\n");
- fprintf(stream, "\n");
- fprintf(stream, " -h, --help print this help and exit\n");
- fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n");
- fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n");
- fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
- fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
- fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
- fprintf(stream, " --stdin read prompt from standard input.\n");
- fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
- fprintf(stream, " --no-parse-special do not parse control tokens.\n");
- fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
- fprintf(stream, " --show-count print the total number of tokens.\n");
+ static void print_usage_information(const char * argv0) {
+ printf("usage: %s [options]\n\n", argv0);
+ printf("The tokenize program tokenizes a prompt using a given model,\n");
+ printf("and prints the resulting tokens to standard output.\n\n");
+ printf("It needs a model file, a prompt, and optionally other flags\n");
+ printf("to control the behavior of the tokenizer.\n\n");
+ printf(" The possible options are:\n");
+ printf("\n");
+ printf(" -h, --help print this help and exit\n");
+ printf(" -m MODEL_PATH, --model MODEL_PATH path to model.\n");
+ printf(" --ids if given, only print numerical token IDs, and not token strings.\n");
+ printf(" The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
+ printf(" -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
+ printf(" -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
+ printf(" --stdin read prompt from standard input.\n");
+ printf(" --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
+ printf(" --no-parse-special do not parse control tokens.\n");
+ printf(" --log-disable disable logs. Makes stderr quiet when loading the model.\n");
+ printf(" --show-count print the total number of tokens.\n");
  }

  static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
@@ -185,7 +187,7 @@ int main(int raw_argc, char ** raw_argv) {
  const int argc = argv.size();

  if (argc <= 1) {
- print_usage_information(argv[0].c_str(), stderr);
+ print_usage_information(argv[0].c_str());
  return 1;
  }

@@ -214,7 +216,7 @@ int main(int raw_argc, char ** raw_argv) {
  for (; iarg < argc; ++iarg) {
  std::string arg{argv[iarg]};
  if (arg == "-h" || arg == "--help") {
- print_usage_information(argv[0].c_str(), stdout);
+ print_usage_information(argv[0].c_str());
  return 0;
  }
  else if (arg == "--ids") {
@@ -323,10 +325,6 @@ int main(int raw_argc, char ** raw_argv) {
  // Start actually doing the tokenizing stuff.
  //////

- #ifdef LOG_DISABLE_LOGS
- disable_logging = true;
- #endif
-
  if (disable_logging) {
  llama_log_set(llama_log_callback_null, NULL);
  }
@@ -362,7 +360,7 @@ int main(int raw_argc, char ** raw_argv) {
  prompt = stdin_buffer.str();
  }

- const bool model_wants_add_bos = llama_should_add_bos_token(model);
+ const bool model_wants_add_bos = llama_add_bos_token(model);
  const bool add_bos = model_wants_add_bos && !no_bos;
  const bool parse_special = !no_parse_special;