@fugood/llama.node 0.3.2 → 0.3.3

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (190)
  1. package/CMakeLists.txt +2 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +1 -1
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +8 -8
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +8 -9
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +4 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +43 -9
  25. package/src/llama.cpp/.github/workflows/docker.yml +3 -0
  26. package/src/llama.cpp/CMakeLists.txt +7 -4
  27. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  28. package/src/llama.cpp/common/CMakeLists.txt +0 -2
  29. package/src/llama.cpp/common/arg.cpp +642 -607
  30. package/src/llama.cpp/common/arg.h +22 -22
  31. package/src/llama.cpp/common/common.cpp +79 -281
  32. package/src/llama.cpp/common/common.h +130 -100
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  34. package/src/llama.cpp/common/log.cpp +50 -50
  35. package/src/llama.cpp/common/log.h +18 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  37. package/src/llama.cpp/common/ngram-cache.h +19 -19
  38. package/src/llama.cpp/common/sampling.cpp +116 -108
  39. package/src/llama.cpp/common/sampling.h +20 -20
  40. package/src/llama.cpp/docs/build.md +37 -17
  41. package/src/llama.cpp/examples/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +14 -14
  43. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  47. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  48. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  49. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  50. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  51. package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
  52. package/src/llama.cpp/examples/infill/infill.cpp +40 -86
  53. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
  54. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  55. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  56. package/src/llama.cpp/examples/llava/clip.cpp +1 -0
  57. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  58. package/src/llama.cpp/examples/llava/llava.cpp +37 -3
  59. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  60. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  61. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  62. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  63. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
  64. package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
  65. package/src/llama.cpp/examples/main/main.cpp +64 -109
  66. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  67. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  68. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  69. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  70. package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
  71. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  72. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
  73. package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
  74. package/src/llama.cpp/examples/server/server.cpp +553 -691
  75. package/src/llama.cpp/examples/server/utils.hpp +312 -25
  76. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  77. package/src/llama.cpp/examples/simple/simple.cpp +128 -96
  78. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  79. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  80. package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
  81. package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
  82. package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
  83. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  84. package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
  85. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  86. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  87. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  88. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  89. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  90. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  91. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  92. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  93. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  94. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  95. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  96. package/src/llama.cpp/ggml/include/ggml.h +53 -393
  97. package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
  98. package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
  99. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  100. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
  101. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  102. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  103. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  104. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  105. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  106. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
  107. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  108. package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
  109. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  110. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
  111. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  112. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
  113. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  114. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  115. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  116. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
  117. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  118. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  120. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  121. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
  122. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  123. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  124. package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
  125. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  126. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
  127. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  128. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  129. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  130. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  131. package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
  132. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  133. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  134. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
  135. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  136. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  137. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
  138. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
  141. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  142. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  143. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
  144. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
  145. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
  146. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  148. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  149. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  150. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  151. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  152. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  153. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  154. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  155. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
  156. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
  157. package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
  158. package/src/llama.cpp/include/llama.h +67 -33
  159. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  160. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  161. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  162. package/src/llama.cpp/src/llama-sampling.cpp +745 -105
  163. package/src/llama.cpp/src/llama-sampling.h +21 -2
  164. package/src/llama.cpp/src/llama-vocab.cpp +49 -9
  165. package/src/llama.cpp/src/llama-vocab.h +35 -11
  166. package/src/llama.cpp/src/llama.cpp +2636 -2406
  167. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  168. package/src/llama.cpp/tests/CMakeLists.txt +1 -2
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
  171. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  172. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  173. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  174. package/src/llama.cpp/tests/test-log.cpp +2 -2
  175. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  176. package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
  177. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  178. package/src/llama.cpp/tests/test-rope.cpp +1 -0
  179. package/src/llama.cpp/tests/test-sampling.cpp +162 -137
  180. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  181. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  182. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  183. package/src/llama.cpp/common/train.cpp +0 -1515
  184. package/src/llama.cpp/common/train.h +0 -233
  185. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  186. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  187. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  188. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  189. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
  190. /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
package/src/llama.cpp/examples/batched/batched.cpp

@@ -15,16 +15,16 @@ static void print_usage(int, char ** argv) {
  }

  int main(int argc, char ** argv) {
- gpt_params params;
+ common_params params;

  params.prompt = "Hello my name is";
  params.n_predict = 32;

- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
  return 1;
  }

- gpt_init();
+ common_init();

  // number of parallel batches
  int n_parallel = params.n_parallel;
@@ -39,7 +39,7 @@ int main(int argc, char ** argv) {

  // initialize the model

- llama_model_params model_params = llama_model_params_from_gpt_params(params);
+ llama_model_params model_params = common_model_params_to_llama(params);

  llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

@@ -51,13 +51,13 @@ int main(int argc, char ** argv) {
  // tokenize the prompt

  std::vector<llama_token> tokens_list;
- tokens_list = ::llama_tokenize(model, params.prompt, true);
+ tokens_list = common_tokenize(model, params.prompt, true);

  const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;

  // initialize the context

- llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+ llama_context_params ctx_params = common_context_params_to_llama(params);

  ctx_params.n_ctx = n_kv_req;
  ctx_params.n_batch = std::max(n_predict, n_parallel);
@@ -94,7 +94,7 @@ int main(int argc, char ** argv) {
  LOG("\n");

  for (auto id : tokens_list) {
- LOG("%s", llama_token_to_piece(ctx, id).c_str());
+ LOG("%s", common_token_to_piece(ctx, id).c_str());
  }

  // create a llama_batch
@@ -108,7 +108,7 @@ int main(int argc, char ** argv) {

  // evaluate the initial prompt
  for (size_t i = 0; i < tokens_list.size(); ++i) {
- llama_batch_add(batch, tokens_list[i], i, seq_ids, false);
+ common_batch_add(batch, tokens_list[i], i, seq_ids, false);
  }
  GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());

@@ -123,8 +123,8 @@ int main(int argc, char ** argv) {
  decoder_start_token_id = llama_token_bos(model);
  }

- llama_batch_clear(batch);
- llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
+ common_batch_clear(batch);
+ common_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
  }

  // llama_decode will output logits only for the last token of the prompt
@@ -161,7 +161,7 @@ int main(int argc, char ** argv) {

  while (n_cur <= n_predict) {
  // prepare the next batch
- llama_batch_clear(batch);
+ common_batch_clear(batch);

  // sample the next token for each parallel sequence / stream
  for (int32_t i = 0; i < n_parallel; ++i) {
@@ -185,15 +185,15 @@ int main(int argc, char ** argv) {

  // if there is only one stream, we print immediately to stdout
  if (n_parallel == 1) {
- LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+ LOG("%s", common_token_to_piece(ctx, new_token_id).c_str());
  }

- streams[i] += llama_token_to_piece(ctx, new_token_id);
+ streams[i] += common_token_to_piece(ctx, new_token_id);

  i_batch[i] = batch.n_tokens;

  // push this new token for next evaluation
- llama_batch_add(batch, new_token_id, n_cur, { i }, true);
+ common_batch_add(batch, new_token_id, n_cur, { i }, true);

  n_decode += 1;
  }
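
The batched.cpp hunks above are typical of most of this diff: the bundled llama.cpp renamed its "common" helpers from the gpt_*/llama_* prefixes to a common_* prefix (gpt_params → common_params, gpt_params_parse → common_params_parse, gpt_init → common_init, llama_tokenize → common_tokenize, llama_batch_clear/llama_batch_add → common_batch_clear/common_batch_add, llama_token_to_piece → common_token_to_piece). Below is a minimal prompt-evaluation sketch using only the renamed helpers that appear in these hunks; it assumes the vendored common.h/arg.h headers, and "model.gguf" is a placeholder path, not something shipped with the package.

// sketch only: new-style common_* helper names from the vendored llama.cpp
#include "arg.h"
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <vector>

int main(int argc, char ** argv) {
    common_params params;                 // was: gpt_params
    params.model  = "model.gguf";         // placeholder model path
    params.prompt = "Hello my name is";

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {  // was: gpt_params_parse
        return 1;
    }
    common_init();                        // was: gpt_init

    llama_backend_init();
    llama_model_params   mparams = common_model_params_to_llama(params);   // was: llama_model_params_from_gpt_params
    llama_context_params cparams = common_context_params_to_llama(params); // was: llama_context_params_from_gpt_params

    llama_model   * model = llama_load_model_from_file(params.model.c_str(), mparams);
    llama_context * ctx   = llama_new_context_with_model(model, cparams);
    if (model == nullptr || ctx == nullptr) {
        return 1;
    }

    // tokenize and build a batch with the renamed helpers
    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);

    llama_batch batch = llama_batch_init((int32_t) tokens.size(), 0, 1);
    common_batch_clear(batch);            // was: llama_batch_clear
    for (size_t i = 0; i < tokens.size(); ++i) {
        common_batch_add(batch, tokens[i], (llama_pos) i, { 0 }, i == tokens.size() - 1); // was: llama_batch_add
    }

    if (llama_decode(ctx, batch) == 0) {
        // print the prompt tokens back as text
        for (llama_token id : tokens) {
            printf("%s", common_token_to_piece(ctx, id).c_str()); // was: llama_token_to_piece
        }
        printf("\n");
    }

    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
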
package/src/llama.cpp/examples/batched-bench/batched-bench.cpp

@@ -15,13 +15,13 @@ static void print_usage(int, char ** argv) {
  }

  int main(int argc, char ** argv) {
- gpt_params params;
+ common_params params;

- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
  return 1;
  }

- gpt_init();
+ common_init();

  int is_pp_shared = params.is_pp_shared;

@@ -36,7 +36,7 @@ int main(int argc, char ** argv) {

  // initialize the model

- llama_model_params model_params = llama_model_params_from_gpt_params(params);
+ llama_model_params model_params = common_model_params_to_llama(params);

  llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

@@ -45,7 +45,7 @@ int main(int argc, char ** argv) {
  return 1;
  }

- llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+ llama_context_params ctx_params = common_context_params_to_llama(params);

  // ensure enough sequences are available
  ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
@@ -74,7 +74,6 @@ int main(int argc, char ** argv) {
  batch.n_seq_id + i,
  batch.seq_id + i,
  batch.logits + i,
- 0, 0, 0, // unused
  };

  const int ret = llama_decode(ctx, batch_view);
@@ -92,7 +91,7 @@ int main(int argc, char ** argv) {
  // warm up
  {
  for (int i = 0; i < 16; ++i) {
- llama_batch_add(batch, 0, i, { 0 }, false);
+ common_batch_add(batch, 0, i, { 0 }, false);
  }

  if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
@@ -122,11 +121,11 @@ int main(int argc, char ** argv) {
  continue;
  }

- llama_batch_clear(batch);
+ common_batch_clear(batch);

  for (int i = 0; i < pp; ++i) {
  for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
- llama_batch_add(batch, 0, i, { j }, false);
+ common_batch_add(batch, 0, i, { j }, false);
  }
  }
  batch.logits[batch.n_tokens - 1] = true;
@@ -151,10 +150,10 @@ int main(int argc, char ** argv) {
  const auto t_tg_start = ggml_time_us();

  for (int i = 0; i < tg; ++i) {
- llama_batch_clear(batch);
+ common_batch_clear(batch);

  for (int j = 0; j < pl; ++j) {
- llama_batch_add(batch, 0, pp + i, { j }, true);
+ common_batch_add(batch, 0, pp + i, { j }, true);
  }

  if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp

@@ -872,7 +872,7 @@ static std::string basename(const std::string &path) {
  }

  int main(int argc, char ** argv) {
- gpt_init();
+ common_init();

  struct train_params params = get_default_train_params();
  if (!params_parse(argc, argv, &params)) {
package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp

@@ -31,7 +31,7 @@ template <class Iter>
  static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
  std::string ret;
  for (; begin != end; ++begin) {
- ret += llama_token_to_piece(ctx, *begin);
+ ret += common_token_to_piece(ctx, *begin);
  }

  return ret;
@@ -272,8 +272,8 @@ struct tokenized_prompt {

  tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
  const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
- tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
- tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
+ tokens_pos = common_tokenize(ctx, pos, add_bos, true);
+ tokens_neg = common_tokenize(ctx, neg, add_bos, true);
  max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
  padding_seq(ctx, tokens_pos, max_seq_len);
  padding_seq(ctx, tokens_neg, max_seq_len);
@@ -281,7 +281,7 @@ struct tokenized_prompt {

  void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
  // TODO: customize padding token
- std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false);
+ std::vector<llama_token> pad_tokens = common_tokenize(ctx, " ", false);
  llama_token pad_tok = pad_tokens.back();
  while (tokens.size() < len) {
  tokens.push_back(pad_tok);
@@ -339,7 +339,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {

  static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
  llama_kv_cache_clear(ctx);
- if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+ if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
  fprintf(stderr, "%s : failed to eval\n", __func__);
  return false;
  }
@@ -370,7 +370,7 @@ static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const
  * Load prompt files and completion file.
  * Then format each pair of prompt + completion to make an entry.
  */
- static int prepare_entries(gpt_params & params, train_context & ctx_train) {
+ static int prepare_entries(common_params & params, train_context & ctx_train) {
  // load prompts
  std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
  std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
@@ -388,9 +388,9 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
  }

  int main(int argc, char ** argv) {
- gpt_params params;
+ common_params params;

- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
  return 1;
  }

@@ -413,7 +413,7 @@ int main(int argc, char ** argv) {
  llama_numa_init(params.numa);

  // load the model to get hparams
- llama_init_result llama_init = llama_init_from_gpt_params(params);
+ common_init_result llama_init = common_init_from_params(params);

  llama_model * model = llama_init.model;
  llama_context * ctx = llama_init.context;
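
The cvector-generator hunks above (and the eval-callback ones further down) also track a change in llama.h itself: llama_batch_get_one() dropped its pos_0 and seq_id arguments and now takes only the token pointer and count. A small sketch of the two-argument form, assuming a llama_context and a tokenized prompt exist as in these examples:

// sketch only: evaluating a tokenized prompt with the trimmed
// llama_batch_get_one() signature (positions start at 0, sequence id 0)
#include "llama.h"

#include <cstdio>
#include <vector>

static bool eval_tokens(llama_context * ctx, std::vector<llama_token> & tokens) {
    llama_kv_cache_clear(ctx); // start from an empty KV cache, as the examples do
    // old call: llama_batch_get_one(tokens.data(), tokens.size(), 0, 0)
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), (int32_t) tokens.size()))) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
    }
    return true;
}
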
package/src/llama.cpp/examples/embedding/embedding.cpp

@@ -28,7 +28,7 @@ static std::vector<std::string> split_lines(const std::string & s, const std::st
  static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
  size_t n_tokens = tokens.size();
  for (size_t i = 0; i < n_tokens; i++) {
- llama_batch_add(batch, tokens[i], i, { seq_id }, true);
+ common_batch_add(batch, tokens[i], i, { seq_id }, true);
  }
  }

@@ -74,18 +74,18 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
  }

  float * out = output + embd_pos * n_embd;
- llama_embd_normalize(embd, out, n_embd, embd_norm);
+ common_embd_normalize(embd, out, n_embd, embd_norm);
  }
  }

  int main(int argc, char ** argv) {
- gpt_params params;
+ common_params params;

- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
  return 1;
  }

- gpt_init();
+ common_init();

  params.embedding = true;
  // For non-causal models, batch size must be equal to ubatch size
@@ -95,7 +95,7 @@ int main(int argc, char ** argv) {
  llama_numa_init(params.numa);

  // load the model
- llama_init_result llama_init = llama_init_from_gpt_params(params);
+ common_init_result llama_init = common_init_from_params(params);

  llama_model * model = llama_init.model;
  llama_context * ctx = llama_init.context;
@@ -122,7 +122,7 @@ int main(int argc, char ** argv) {
  // print system information
  {
  LOG_INF("\n");
- LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("%s\n", common_params_get_system_info(params).c_str());
  }

  // split the prompt into lines
@@ -135,7 +135,7 @@ int main(int argc, char ** argv) {
  // tokenize the prompts and trim
  std::vector<std::vector<int32_t>> inputs;
  for (const auto & prompt : prompts) {
- auto inp = ::llama_tokenize(ctx, prompt, true, true);
+ auto inp = common_tokenize(ctx, prompt, true, true);
  if (inp.size() > n_batch) {
  LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
  __func__, (long long int) inp.size(), (long long int) n_batch);
@@ -159,7 +159,7 @@ int main(int argc, char ** argv) {
  LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
  LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
  for (int j = 0; j < (int) inputs[i].size(); j++) {
- LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
+ LOG("%6d -> '%s'\n", inputs[i][j], common_token_to_piece(ctx, inputs[i][j]).c_str());
  }
  LOG("\n\n");
  }
@@ -199,7 +199,7 @@ int main(int argc, char ** argv) {
  batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
  e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
  s = 0;
- llama_batch_clear(batch);
+ common_batch_clear(batch);
  }

  // add to batch
@@ -263,7 +263,7 @@ int main(int argc, char ** argv) {
  LOG("\n");
  for (int i = 0; i < n_prompts; i++) {
  for (int j = 0; j < n_prompts; j++) {
- float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+ float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
  LOG("%6.2f ", sim);
  }
  LOG("%1.10s", prompts[i].c_str());
@@ -296,7 +296,7 @@ int main(int argc, char ** argv) {
  for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
  LOG(" [");
  for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
- float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+ float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
  LOG("%6.2f", sim);
  j++;
  if (j < n_embd_count) LOG(", "); else break;
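
The embedding.cpp hunks also rename the embedding helpers (llama_embd_normalize → common_embd_normalize, llama_embd_similarity_cos → common_embd_similarity_cos). A hedged sketch comparing two already-computed embedding vectors with the renamed helpers; the vector values here are made-up placeholders:

// sketch only: normalize two embedding vectors and compare them with the
// renamed common_* helpers (assumes the vendored common.h)
#include "common.h"

#include <cstdio>
#include <vector>

int main() {
    // placeholder embeddings; real ones come from llama_get_embeddings_*()
    std::vector<float> e0 = {0.1f, 0.7f, 0.2f};
    std::vector<float> e1 = {0.2f, 0.6f, 0.3f};
    const int n_embd = (int) e0.size();

    std::vector<float> e0n(n_embd), e1n(n_embd);
    common_embd_normalize(e0.data(), e0n.data(), n_embd); // was: llama_embd_normalize
    common_embd_normalize(e1.data(), e1n.data(), n_embd);

    const float sim = common_embd_similarity_cos(e0n.data(), e1n.data(), n_embd); // was: llama_embd_similarity_cos
    std::printf("cosine similarity: %.3f\n", sim);
    return 0;
}
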
package/src/llama.cpp/examples/eval-callback/eval-callback.cpp

@@ -126,12 +126,12 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
  return true;
  }

- static bool run(llama_context * ctx, const gpt_params & params) {
+ static bool run(llama_context * ctx, const common_params & params) {
  const bool add_bos = llama_add_bos_token(llama_get_model(ctx));

- std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+ std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);

- if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+ if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
  LOG_ERR("%s : failed to eval\n", __func__);
  return false;
  }
@@ -142,13 +142,13 @@ static bool run(llama_context * ctx, const gpt_params & params) {
  int main(int argc, char ** argv) {
  callback_data cb_data;

- gpt_params params;
+ common_params params;

- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
  return 1;
  }

- gpt_init();
+ common_init();

  llama_backend_init();
  llama_numa_init(params.numa);
@@ -160,7 +160,7 @@ int main(int argc, char ** argv) {
  params.warmup = false;

  // init
- llama_init_result llama_init = llama_init_from_gpt_params(params);
+ common_init_result llama_init = common_init_from_params(params);

  llama_model * model = llama_init.model;
  llama_context * ctx = llama_init.context;
@@ -172,7 +172,7 @@ int main(int argc, char ** argv) {
  // print system information
  {
  LOG_INF("\n");
- LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("%s\n", common_params_get_system_info(params).c_str());
  LOG_INF("\n");
  }

package/src/llama.cpp/examples/export-lora/export-lora.cpp

@@ -128,7 +128,7 @@ struct lora_merge_ctx {

  lora_merge_ctx(
  std::string & base_fname,
- std::vector<llama_lora_adapter_info> & lora_files,
+ std::vector<common_lora_adapter_info> & lora_files,
  std::string & outfile,
  int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
  fout.exceptions(std::ofstream::failbit); // fail fast on write errors
@@ -314,9 +314,9 @@ struct lora_merge_ctx {
  // optionally dequantize it
  printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
  auto nels = ggml_nelements(inp_base);
- ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
+ const auto * qtype = ggml_get_type_traits(base->type);
  std::vector<uint8_t> dequant_buf(nels * sizeof(float));
- qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
+ qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
  ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
  } else {
  ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
@@ -400,9 +400,9 @@ static void print_usage(int, char ** argv) {
  }

  int main(int argc, char ** argv) {
- gpt_params params;
+ common_params params;

- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
  return 1;
  }

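Besides the common_* renames, the export-lora hunk above picks up a ggml API change: ggml_internal_get_type_traits(), which returned a ggml_type_traits_t by value, is replaced by ggml_get_type_traits(), which returns a pointer. A small dequantization sketch under that assumption; the quantized buffer and element count are placeholders supplied by the caller:

// sketch only: dequantize a quantized buffer to F32 via the pointer-returning
// ggml_get_type_traits() (replaces ggml_internal_get_type_traits())
#include "ggml.h"

#include <cstdint>
#include <vector>

static std::vector<float> dequantize_to_f32(const void * qdata, ggml_type type, int64_t nels) {
    const auto * traits = ggml_get_type_traits(type); // was: ggml_type_traits_t t = ggml_internal_get_type_traits(type)
    std::vector<float> out((size_t) nels);
    traits->to_float(qdata, out.data(), nels);        // member access through a pointer now
    return out;
}
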
package/src/llama.cpp/examples/gen-docs/gen-docs.cpp

@@ -11,7 +11,7 @@ static void write_table_header(std::ofstream & file) {
  file << "| -------- | ----------- |\n";
  }

- static void write_table_entry(std::ofstream & file, const llama_arg & opt) {
+ static void write_table_entry(std::ofstream & file, const common_arg & opt) {
  file << "| `";
  // args
  for (const auto & arg : opt.args) {
@@ -40,7 +40,7 @@ static void write_table_entry(std::ofstream & file, const llama_arg & opt) {
  file << "` | " << md_help << " |\n";
  }

- static void write_table(std::ofstream & file, std::vector<llama_arg *> & opts) {
+ static void write_table(std::ofstream & file, std::vector<common_arg *> & opts) {
  write_table_header(file);
  for (const auto & opt : opts) {
  write_table_entry(file, *opt);
@@ -50,12 +50,12 @@ static void write_table(std::ofstream & file, std::vector<llama_arg *> & opts) {
  static void export_md(std::string fname, llama_example ex) {
  std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);

- gpt_params params;
- auto ctx_arg = gpt_params_parser_init(params, ex);
+ common_params params;
+ auto ctx_arg = common_params_parser_init(params, ex);

- std::vector<llama_arg *> common_options;
- std::vector<llama_arg *> sparam_options;
- std::vector<llama_arg *> specific_options;
+ std::vector<common_arg *> common_options;
+ std::vector<common_arg *> sparam_options;
+ std::vector<common_arg *> specific_options;
  for (auto & opt : ctx_arg.options) {
  // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
  if (opt.is_sparam) {
package/src/llama.cpp/examples/gritlm/gritlm.cpp

@@ -15,11 +15,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
  llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);

  for (uint64_t i = 0; i < sentences.size(); i++) {
- llama_batch_clear(batch);
+ common_batch_clear(batch);

  const std::string input_string = instruction + sentences[i];

- std::vector<llama_token> inputs = llama_tokenize(model, input_string, true, false);
+ std::vector<llama_token> inputs = common_tokenize(model, input_string, true, false);

  const int32_t n_toks = inputs.size();

@@ -28,7 +28,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
  // inputs.push_back(llama_token_eos(model));

  // we want to ignore instruction tokens for mean pooling
- const int32_t n_inst = llama_tokenize(model, instruction, true, false).size();
+ const int32_t n_inst = common_tokenize(model, instruction, true, false).size();

  #ifdef GRIT_DEBUG
  // debug tokens - should be matching as referenced in the GritLM sample
@@ -40,7 +40,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve

  // add input to batch (this increments n_tokens)
  for (int32_t j = 0; j < n_toks; j++) {
- llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
+ common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
  }

  // clear previous kv_cache values (irrelevant for embeddings)
@@ -75,7 +75,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
  }

  std::vector<float> emb_norm(emb_unorm.size());
- llama_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);
+ common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);
  result.push_back(emb_norm);

  #ifdef GRIT_DEBUG
@@ -105,16 +105,16 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std

  llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);

- std::vector<llama_token> inputs = llama_tokenize(model, prompt, false, true);
+ std::vector<llama_token> inputs = common_tokenize(model, prompt, false, true);
  int32_t i_current_token = 0;

  while (true) {
- llama_batch_clear(bat);
+ common_batch_clear(bat);
  {
  const int32_t n_inputs = inputs.size();

  for (int32_t i = 0; i < n_inputs; i++) {
- llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
+ common_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
  }
  }
  inputs.clear();
@@ -127,7 +127,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
  break;
  }

- std::string piece = llama_token_to_piece(ctx, token);
+ std::string piece = common_token_to_piece(ctx, token);
  if (stream) {
  std::printf("%s", piece.c_str());
  std::fflush(stdout);
@@ -152,16 +152,16 @@ static std::string gritlm_instruction(const std::string & instruction) {
  }

  int main(int argc, char * argv[]) {
- gpt_params params;
+ common_params params;

- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
  return 1;
  }

- gpt_init();
+ common_init();

- llama_model_params mparams = llama_model_params_from_gpt_params(params);
- llama_context_params cparams = llama_context_params_from_gpt_params(params);
+ llama_model_params mparams = common_model_params_to_llama(params);
+ llama_context_params cparams = common_context_params_to_llama(params);

  llama_backend_init();

@@ -199,10 +199,10 @@ int main(int argc, char * argv[]) {

  const int n_embd = llama_n_embd(model);

- const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
- const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
- const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
- const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);
+ const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
+ const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
+ const float cosine_sim_q1_d0 = common_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
+ const float cosine_sim_q1_d1 = common_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);

  std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
  std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);