@fugood/llama.node 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
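The headline change in this update is a large bump of the vendored llama.cpp sources, which retires the old llama_sample_* free functions in favor of the object-based llama_sampler API (visible above in common/sampling.cpp, the new common/arg.cpp, src/llama-sampling.cpp, and in the test diffs below). As a minimal sketch, assuming the llama.h bundled with this revision, samplers are now constructed individually and composed into a chain:

// Sketch only: the new llama_sampler chain API that this update vendors in.
// Model loading and decoding are omitted; parameter values are illustrative,
// not this package's defaults.
#include "llama.h"

static llama_sampler * make_sampler_chain() {
    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());

    // samplers are applied in the order they are added to the chain
    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9f, 1));
    llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8f));
    llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

    // draw a token with llama_sampler_sample(smpl, ctx, -1);
    // release with llama_sampler_free(smpl)
    return smpl;
}

Standalone samplers can also be applied one-shot to a llama_token_data_array, which is the pattern the updated test-sampling.cpp below exercises through its APPLY macro.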
package/src/llama.cpp/tests/test-sampling.cpp
@@ -10,181 +10,199 @@
 #include <string>
 #include <vector>
 
-static void dump(const llama_token_data_array * candidates) {
-    for (size_t i = 0; i < candidates->size; i++) {
-        printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit);
+static void dump(const llama_token_data_array * cur_p) {
+    for (size_t i = 0; i < cur_p->size; i++) {
+        printf("%d: %f (%f)\n", cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
     }
 }
 
-#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)
+#define DUMP(__cur_p) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__cur_p)); printf("-\n"); } while(0)
+
+#define APPLY(__cnstr, __cur_p) do { \
+    auto * cnstr = (__cnstr); \
+    llama_sampler_apply(cnstr, (__cur_p)); \
+    llama_sampler_free(cnstr); \
+} while(0)
 
 static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
     const size_t n_vocab = probs.size();
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }
 
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    llama_sample_softmax(nullptr, &candidates_p);
-    DUMP(&candidates_p);
-    llama_sample_top_k(nullptr, &candidates_p, k, 1);
-    DUMP(&candidates_p);
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    APPLY(llama_sampler_init_softmax(), &cur_p);
+    DUMP(&cur_p);
+    APPLY(llama_sampler_init_top_k(k), &cur_p);
+    DUMP(&cur_p);
 
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-5);
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5);
     }
 }
 
 static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
     const size_t n_vocab = probs.size();
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }
 
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    llama_sample_softmax(nullptr, &candidates_p);
-    DUMP(&candidates_p);
-    llama_sample_top_p(nullptr, &candidates_p, p, 1);
-    DUMP(&candidates_p);
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    APPLY(llama_sampler_init_softmax(), &cur_p);
+    DUMP(&cur_p);
+    APPLY(llama_sampler_init_top_p(p, 1), &cur_p);
+    DUMP(&cur_p);
 
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
     }
 }
 
 static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
     const size_t n_vocab = probs.size();
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }
 
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    DUMP(&candidates_p);
-    llama_sample_tail_free(nullptr, &candidates_p, z, 1);
-    DUMP(&candidates_p);
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    DUMP(&cur_p);
+    APPLY(llama_sampler_init_tail_free(z, 1), &cur_p);
+    DUMP(&cur_p);
 
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
     }
 }
 
 static void test_min_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
     const size_t n_vocab = probs.size();
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }
 
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    DUMP(&candidates_p);
-    llama_sample_min_p(nullptr, &candidates_p, p, 1);
-    DUMP(&candidates_p);
-    llama_sample_softmax(nullptr, &candidates_p);
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    DUMP(&cur_p);
+    APPLY(llama_sampler_init_min_p(p, 1), &cur_p);
+    DUMP(&cur_p);
+    APPLY(llama_sampler_init_softmax(), &cur_p);
 
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
     }
 }
 
 static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
     const size_t n_vocab = probs.size();
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }
 
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    DUMP(&candidates_p);
-    llama_sample_typical(nullptr, &candidates_p, p, 1);
-    DUMP(&candidates_p);
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    DUMP(&cur_p);
+    APPLY(llama_sampler_init_typical(p, 1), &cur_p);
+    DUMP(&cur_p);
 
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
     }
 }
 
-static void test_repetition_penalties(
+static void test_penalties(
     const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
     const std::vector<float> & expected_probs, float repeat_penalty, float alpha_frequency, float alpha_presence
 ) {
     GGML_ASSERT(probs.size() == expected_probs.size());
 
     const size_t n_vocab = probs.size();
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
+    }
+
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+
+    auto * sampler = llama_sampler_init_penalties(n_vocab, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence, false, false);
+
+    for (size_t i = 0; i < last_tokens.size(); i++) {
+        llama_sampler_accept(sampler, last_tokens[i]);
     }
 
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    llama_sample_softmax(nullptr, &candidates_p);
-    DUMP(&candidates_p);
-    llama_sample_repetition_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence);
-    llama_sample_softmax(nullptr, &candidates_p);
-    DUMP(&candidates_p);
+    APPLY(llama_sampler_init_softmax(), &cur_p);
+    DUMP(&cur_p);
+    APPLY(sampler, &cur_p);
+    APPLY(llama_sampler_init_softmax(), &cur_p);
+    DUMP(&cur_p);
 
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
     }
 }
 
-static void test_sampler_queue(
-    const size_t n_vocab, const std::string samplers_sequence, const int top_k, const float top_p, const float min_p
+static void test_sampler_queue(const size_t n_vocab, const std::string & samplers_sequence, const int top_k, const float top_p, const float min_p
 ) {
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(token_id);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }
 
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
 
     llama_token min_token_id = 0;
     const llama_token max_token_id = n_vocab-1;
 
     for (auto s : samplers_sequence) {
         switch (s){
-            case 'k': llama_sample_top_k (nullptr, &candidates_p, top_k, 1); break;
-            case 'f': GGML_ABORT("tail_free test not implemented"); break;
-            case 'y': GGML_ABORT("typical test not implemented"); break;
-            case 'p': llama_sample_top_p (nullptr, &candidates_p, top_p, 1); break;
-            case 'm': llama_sample_min_p (nullptr, &candidates_p, min_p, 1); break;
-            case 't': GGML_ABORT("temperature test not implemented"); break;
-            default : GGML_ABORT("Unknown sampler"); break;
+            case 'k': APPLY(llama_sampler_init_top_k(top_k), &cur_p); break;
+            case 'f': GGML_ABORT("tail_free test not implemented");
+            case 'y': GGML_ABORT("typical test not implemented");
+            case 'p': APPLY(llama_sampler_init_top_p(top_p, 1), &cur_p); break;
+            case 'm': APPLY(llama_sampler_init_min_p(min_p, 1), &cur_p); break;
+            case 't': GGML_ABORT("temperature test not implemented");
+            default : GGML_ABORT("Unknown sampler");
         }
 
-        llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests
+        APPLY(llama_sampler_init_softmax(), &cur_p); // make sure tokens are sorted for tests
 
-        const int size = candidates_p.size;
+        const int size = cur_p.size;
 
         if (s == 'k') {
             const int expected_size = std::min(size, top_k);
             min_token_id = std::max(min_token_id, (llama_token)(n_vocab - top_k));
 
             GGML_ASSERT(size == expected_size);
-            GGML_ASSERT(candidates_p.data[0].id == max_token_id);
-            GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
+            GGML_ASSERT(cur_p.data[0].id == max_token_id);
+            GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id);
         } else if (s == 'p') {
             const int softmax_divisor = n_vocab * (n_vocab-1) / 2 - min_token_id * (min_token_id-1) / 2;
             const int softmax_numerator_target = ceilf(top_p * softmax_divisor);
@@ -206,8 +224,8 @@ static void test_sampler_queue(
             }
 
             GGML_ASSERT(size == expected_size);
-            GGML_ASSERT(candidates_p.data[0].id == max_token_id);
-            GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
+            GGML_ASSERT(cur_p.data[0].id == max_token_id);
+            GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id);
         } else if (s == 'm') {
             int expected_size = ceilf((1.0f-min_p) * n_vocab);
             expected_size = std::max(expected_size, 1);
@@ -219,17 +237,56 @@ static void test_sampler_queue(
             min_token_id = std::min(min_token_id, (llama_token)(n_vocab - 1));
 
             GGML_ASSERT(size == expected_size);
-            GGML_ASSERT(candidates_p.data[0].id == max_token_id);
-            GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
+            GGML_ASSERT(cur_p.data[0].id == max_token_id);
+            GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id);
         } else {
             GGML_ABORT("fatal error");
         }
     }
 
-    printf("Sampler queue %3s OK with n_vocab=%05ld top_k=%05d top_p=%f min_p=%f\n",
+    printf("Sampler queue %3s OK with n_vocab=%05zu top_k=%05d top_p=%f min_p=%f\n",
            samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
 }
 
+static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vector<llama_token_data> & data, int n_iter) {
+    std::vector<llama_token_data> cur(data.size());
+    std::copy(data.begin(), data.end(), cur.begin());
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    llama_sampler_apply(cnstr, &cur_p);
+    llama_sampler_reset(cnstr);
+    const int64_t t_start = ggml_time_us();
+    for (int i = 0; i < n_iter; i++) {
+        std::copy(data.begin(), data.end(), cur.begin());
+        llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+        llama_sampler_apply(cnstr, &cur_p);
+        llama_sampler_reset(cnstr);
+    }
+    const int64_t t_end = ggml_time_us();
+    llama_sampler_free(cnstr);
+    printf("%-42s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
+}
+
+#define BENCH(__cnstr, __data, __n_iter) bench((__cnstr), #__cnstr, (__data), (__n_iter))
+
+static void test_perf() {
+    const int n_vocab = 1 << 17;
+
+    std::vector<llama_token_data> data;
+
+    data.reserve(n_vocab);
+    for (int i = 0; i < n_vocab; i++) {
+        const float logit = 2.0f*((float)(rand())/RAND_MAX - 0.5f);
+        data.emplace_back(llama_token_data{i, logit, 0.0f});
+    }
+
+    BENCH(llama_sampler_init_top_k    (40),      data, 32);
+    BENCH(llama_sampler_init_top_p    (0.8f, 1), data, 32);
+    BENCH(llama_sampler_init_min_p    (0.2f, 1), data, 32);
+    BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32);
+    BENCH(llama_sampler_init_typical  (0.5f, 1), data, 32);
+    BENCH(llama_sampler_init_softmax  (),        data, 32);
+}
+
 int main(void) {
     ggml_time_init();
 
@@ -259,13 +316,13 @@ int main(void) {
     test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
     test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);
 
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0},             {0.25f, 0.25f, 0.25f, 0.25f, 0},   50.0f, 0.0f, 0.0f);
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2},       {0.5f, 0.5f, 0, 0, 0},             50.0f, 0.0f, 0.0f);
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0},             50.0f, 0.0f, 0.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0},             {0.25f, 0.25f, 0.25f, 0.25f, 0},   50.0f, 0.0f, 0.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2},       {0.5f, 0.5f, 0, 0, 0},             50.0f, 0.0f, 0.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0},             50.0f, 0.0f, 0.0f);
 
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0},             {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 1.0f, 5.0f, 5.0f);
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2},       {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0},             {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 1.0f, 5.0f, 5.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2},       {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
 
     test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f);
     test_sampler_queue(10000, "k", 1,     1.0f, 1.0f);
@@ -297,5 +354,7 @@ int main(void) {
 
     printf("OK\n");
 
+    test_perf();
+
     return 0;
 }
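Two details of the migration above are easy to miss. First, llama_token_data_array gained a selected-index field, hence the extra -1 in every initializer. Second, the penalties sampler is now stateful: rather than receiving the recent-token history as an array argument, it is fed one token at a time through llama_sampler_accept. A sketch of that pattern, with parameter roles inferred from the nine-argument call in test_penalties above (the comments are my reading of that call, not copied from llama.h):

// Stateful penalties sampler, mirroring test_penalties above.
#include <vector>
#include "llama.h"

static void apply_penalties_example(llama_token_data_array * cur_p, int32_t n_vocab) {
    const std::vector<llama_token> history = {0, 1, 2}; // recently sampled tokens

    llama_sampler * pen = llama_sampler_init_penalties(
            n_vocab,
            LLAMA_TOKEN_NULL,          // special EOS id (unused here)
            LLAMA_TOKEN_NULL,          // linefeed id (unused here)
            (int32_t) history.size(),  // window of recent tokens to penalize
            1.1f,                      // repeat penalty
            0.0f,                      // frequency penalty
            0.0f,                      // presence penalty
            false,                     // penalize newline
            false);                    // ignore EOS

    for (const llama_token t : history) {
        llama_sampler_accept(pen, t); // record history before applying
    }

    llama_sampler_apply(pen, cur_p);
    llama_sampler_free(pen);
}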
package/src/llama.cpp/tests/test-tokenizer-0.cpp
@@ -7,6 +7,7 @@
 #include <map>
 #include <vector>
 #include <fstream>
+#include <thread>
 
 //static const std::map<std::string, std::vector<llama_token>> & k_tests() {
 //    static std::map<std::string, std::vector<llama_token>> _k_tests = {
@@ -194,45 +195,64 @@ int main(int argc, char **argv) {
 
     const bool add_special = false;
 
-    for (const auto & test_kv : k_tests) {
-        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, false);
-
-        printf("\n");
-        printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize(ctx, res).c_str());
-        printf("tok: ");
-        for (const auto & tok : res) {
-            printf("%d ", tok);
-        }
-        printf("\n");
-
-        bool correct = res.size() == test_kv.second.size();
-        for (int i = 0; i < (int) res.size() && correct; ++i) {
-            if (test_kv.second[i] != res[i]) {
-                correct = false;
+    // multi-threaded tokenization
+    const int nthread = std::thread::hardware_concurrency();
+    std::vector<std::thread> threads(nthread);
+
+    for (int i = 0; i < nthread; i++) {
+        threads[i] = std::thread([&, i]() {
+            for (const auto & test_kv : k_tests) {
+                const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, false);
+
+                // here only print the result of the first thread
+                // because the other threads are running the same tests
+                if (i != 0) {
+                    continue;
+                }
+
+                printf("\n");
+                printf("src: '%s'\n", test_kv.first.c_str());
+                printf("res: '%s'\n", llama_detokenize(ctx, res).c_str());
+                printf("tok: ");
+                for (const auto & tok : res) {
+                    printf("%d ", tok);
+                }
+                printf("\n");
+
+                bool correct = res.size() == test_kv.second.size();
+                for (int i = 0; i < (int) res.size() && correct; ++i) {
+                    if (test_kv.second[i] != res[i]) {
+                        correct = false;
+                    }
+                }
+
+                if (!correct) {
+                    fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
+                    fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
+                        llama_detokenize(ctx, res).c_str(),
+                        llama_detokenize(ctx, test_kv.second).c_str());
+                    fprintf(stderr, "%s : expected tokens: ", __func__);
+                    for (const auto & t : test_kv.second) {
+                        fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
+                    }
+                    fprintf(stderr, "\n");
+                    fprintf(stderr, "%s : got tokens: ", __func__);
+                    for (const auto & t : res) {
+                        fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
+                    }
+                    fprintf(stderr, "\n");
+
+                    success = false;
+                }
             }
-        }
-
-        if (!correct) {
-            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
-            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize(ctx, res).c_str(),
-                llama_detokenize(ctx, test_kv.second).c_str());
-            fprintf(stderr, "%s : expected tokens: ", __func__);
-            for (const auto & t : test_kv.second) {
-                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
-            }
-            fprintf(stderr, "\n");
-            fprintf(stderr, "%s : got tokens: ", __func__);
-            for (const auto & t : res) {
-                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
-            }
-            fprintf(stderr, "\n");
+        });
+    }
 
-            success = false;
-        }
+    for (int i = 0; i < nthread; i++) {
+        threads[i].join();
     }
 
+    // single threaded tokenization
     if (!fname_text.empty()) {
        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
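The tokenizer test now runs the full test set on every hardware thread before the single-threaded file pass, turning it into a basic thread-safety check for llama_tokenize; only thread 0 prints, since every thread runs identical tests. One caveat: std::thread::hardware_concurrency() may return 0, which the diffed code does not guard against. A generic sketch of the pattern with that guard added (run_on_all_threads is a hypothetical helper, not part of this package):

// Run the same check on all hardware threads to shake out data races.
#include <algorithm>
#include <thread>
#include <vector>

template <typename Fn>
static void run_on_all_threads(Fn && check) {
    const unsigned nthread = std::max(1u, std::thread::hardware_concurrency());
    std::vector<std::thread> threads;
    threads.reserve(nthread);
    for (unsigned i = 0; i < nthread; i++) {
        threads.emplace_back([&check, i] { check(i); }); // convention: only check(0) prints
    }
    for (auto & t : threads) {
        t.join();
    }
}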
 
package/patches/llama.patch
@@ -1,22 +0,0 @@
-diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
-index fa68360b..f9ff7b5d 100644
---- a/ggml/src/ggml-vulkan.cpp
-+++ b/ggml/src/ggml-vulkan.cpp
-@@ -617,9 +617,15 @@ static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, co
-         vk::PipelineCreateFlags(),
-         pipeline_shader_create_info,
-         pipeline->layout);
--    pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
- 
--    device->pipelines.push_back(pipeline);
-+    try {
-+        pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
-+        device->pipelines.push_back(pipeline);
-+    } catch(vk::UnknownError const&) {
-+        VK_LOG_DEBUG("Failed to create pipeline " << name);
-+        ggml_vk_destroy_pipeline(device->device, pipeline);
-+        pipeline.reset();
-+    }
- }
-
- static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
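Finally, the local patches/llama.patch shown above is deleted in 0.3.2. The patch wrapped Vulkan compute-pipeline creation in a try/catch so that a pipeline that fails to build is logged and skipped rather than crashing the process. Its removal alongside the large ggml-vulkan.cpp update (+1508 -1124 in the file list) suggests the vendored llama.cpp now covers this case upstream, though the diff itself does not state the reason.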