@fugood/llama.node 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275

package/src/llama.cpp/examples/embedding/embedding.cpp

@@ -1,4 +1,6 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <ctime>
@@ -31,13 +33,24 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
 }
 
 static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    const struct llama_model * model = llama_get_model(ctx);
+
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_cache_clear(ctx);
 
     // run model
-    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
-    if (llama_decode(ctx, batch) < 0) {
-        fprintf(stderr, "%s : failed to decode\n", __func__);
+    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+    if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
+        // encoder-only model
+        if (llama_encode(ctx, batch) < 0) {
+            LOG_ERR("%s : failed to encode\n", __func__);
+        }
+    } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
+        // decoder-only model
+        if (llama_decode(ctx, batch) < 0) {
+            LOG_ERR("%s : failed to decode\n", __func__);
+        }
     }
 
     for (int i = 0; i < batch.n_tokens; i++) {
@@ -45,11 +58,22 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
             continue;
         }
 
-        // try to get sequence embeddings - supported only when pooling_type is not NONE
-        const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-        GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
+        const float * embd = nullptr;
+        int embd_pos = 0;
+
+        if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+            // try to get token embeddings
+            embd = llama_get_embeddings_ith(ctx, i);
+            embd_pos = i;
+            GGML_ASSERT(embd != NULL && "failed to get token embeddings");
+        } else {
+            // try to get sequence embeddings - supported only when pooling_type is not NONE
+            embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+            embd_pos = batch.seq_id[i][0];
+            GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
+        }
 
-        float * out = output + batch.seq_id[i][0] * n_embd;
+        float * out = output + embd_pos * n_embd;
         llama_embd_normalize(embd, out, n_embd, embd_norm);
     }
 }
@@ -57,35 +81,26 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
         return 1;
     }
 
+    gpt_init();
+
     params.embedding = true;
     // For non-causal models, batch size must be equal to ubatch size
     params.n_ubatch = params.n_batch;
 
-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model;
-    llama_context * ctx;
-
     // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }
 
@@ -93,20 +108,21 @@ int main(int argc, char ** argv) {
     const int n_ctx = llama_n_ctx(ctx);
 
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
-    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+
+    if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
+        LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__);
         return 1;
     }
 
     if (n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
     }
 
     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
    }
 
     // split the prompt into lines
@@ -119,9 +135,9 @@ int main(int argc, char ** argv) {
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true, false);
+        auto inp = ::llama_tokenize(ctx, prompt, true, true);
         if (inp.size() > n_batch) {
-            fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+            LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
             return 1;
         }
@@ -132,20 +148,20 @@ int main(int argc, char ** argv) {
     // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
     for (auto & inp : inputs) {
         if (inp.empty() || inp.back() != llama_token_sep(model)) {
-            fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
-            fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
+            LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
+            LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
        }
     }
 
     // tokenization stats
     if (params.verbose_prompt) {
         for (int i = 0; i < (int) inputs.size(); i++) {
-            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
-            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
+            LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
+            LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
             for (int j = 0; j < (int) inputs[i].size(); j++) {
-                fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
+                LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
             }
-            fprintf(stderr, "\n\n");
+            LOG("\n\n");
         }
     }
 
@@ -153,13 +169,23 @@ int main(int argc, char ** argv) {
     const int n_prompts = prompts.size();
     struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
 
+    // count number of embeddings
+    int n_embd_count = 0;
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        for (int k = 0; k < n_prompts; k++) {
+            n_embd_count += inputs[k].size();
+        }
+    } else {
+        n_embd_count = n_prompts;
+    }
+
     // allocate output
     const int n_embd = llama_n_embd(model);
-    std::vector<float> embeddings(n_prompts * n_embd, 0);
+    std::vector<float> embeddings(n_embd_count * n_embd, 0);
     float * emb = embeddings.data();
 
     // break into batches
-    int p = 0; // number of prompts processed already
+    int e = 0; // number of embeddings already stored
     int s = 0; // number of prompts in current batch
     for (int k = 0; k < n_prompts; k++) {
         // clamp to n_batch tokens
@@ -169,11 +195,11 @@ int main(int argc, char ** argv) {
 
         // encode if at capacity
         if (batch.n_tokens + n_toks > n_batch) {
-            float * out = emb + p * n_embd;
+            float * out = emb + e * n_embd;
             batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
-            llama_batch_clear(batch);
-            p += s;
+            e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
             s = 0;
+            llama_batch_clear(batch);
        }
 
         // add to batch
@@ -182,39 +208,67 @@ int main(int argc, char ** argv) {
     }
 
     // final batch
-    float * out = emb + p * n_embd;
+    float * out = emb + e * n_embd;
     batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
 
     if (params.embd_out.empty()) {
-        // print the first part of the embeddings or for a single prompt, the full embedding
-        fprintf(stdout, "\n");
-        for (int j = 0; j < n_prompts; j++) {
-            fprintf(stdout, "embedding %d: ", j);
-            for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
-                if (params.embd_normalize == 0) {
-                    fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
-                } else {
-                    fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+        LOG("\n");
+
+        if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+            for (int j = 0; j < n_embd_count; j++) {
+                LOG("embedding %d: ", j);
+                for (int i = 0; i < std::min(3, n_embd); i++) {
+                    if (params.embd_normalize == 0) {
+                        LOG("%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        LOG("%9.6f ", emb[j * n_embd + i]);
+                    }
                 }
+                LOG(" ... ");
+                for (int i = n_embd - 3; i < n_embd; i++) {
+                    if (params.embd_normalize == 0) {
+                        LOG("%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        LOG("%9.6f ", emb[j * n_embd + i]);
+                    }
+                }
+                LOG("\n");
            }
-            fprintf(stdout, "\n");
-        }
-
-        // print cosine similarity matrix
-        if (n_prompts > 1) {
-            fprintf(stdout, "\n");
-            printf("cosine similarity matrix:\n\n");
-            for (int i = 0; i < n_prompts; i++) {
-                fprintf(stdout, "%6.6s ", prompts[i].c_str());
+        } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
+            for (int j = 0; j < n_embd_count; j++) {
+                // NOTE: if you change this log - update the tests in ci/run.sh
+                LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
            }
-            fprintf(stdout, "\n");
-            for (int i = 0; i < n_prompts; i++) {
-                for (int j = 0; j < n_prompts; j++) {
-                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                    fprintf(stdout, "%6.2f ", sim);
+        } else {
+            // print the first part of the embeddings or for a single prompt, the full embedding
+            for (int j = 0; j < n_prompts; j++) {
+                LOG("embedding %d: ", j);
+                for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+                    if (params.embd_normalize == 0) {
+                        LOG("%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        LOG("%9.6f ", emb[j * n_embd + i]);
+                    }
+                }
+                LOG("\n");
+            }
+
+            // print cosine similarity matrix
+            if (n_prompts > 1) {
+                LOG("\n");
+                LOG("cosine similarity matrix:\n\n");
+                for (int i = 0; i < n_prompts; i++) {
+                    LOG("%6.6s ", prompts[i].c_str());
+                }
+                LOG("\n");
+                for (int i = 0; i < n_prompts; i++) {
+                    for (int j = 0; j < n_prompts; j++) {
+                        float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                        LOG("%6.2f ", sim);
+                    }
+                    LOG("%1.10s", prompts[i].c_str());
+                    LOG("\n");
                 }
-                fprintf(stdout, "%1.10s", prompts[i].c_str());
-                fprintf(stdout, "\n");
             }
         }
     }
@@ -222,43 +276,45 @@ int main(int argc, char ** argv) {
     if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
         const bool notArray = params.embd_out != "array";
 
-        fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
+        LOG(notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
         for (int j = 0;;) { // at least one iteration (one prompt)
-            if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
-            fprintf(stdout, "[");
+            if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
+            LOG("[");
             for (int i = 0;;) { // at least one iteration (n_embd > 0)
-                fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+                LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
                 i++;
-                if (i < n_embd) fprintf(stdout, ","); else break;
+                if (i < n_embd) LOG(","); else break;
            }
-            fprintf(stdout, notArray ? "]\n }" : "]");
+            LOG(notArray ? "]\n }" : "]");
             j++;
-            if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
+            if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break;
        }
-        fprintf(stdout, notArray ? "\n ]" : "]\n");
+        LOG(notArray ? "\n ]" : "]\n");
 
         if (params.embd_out == "json+" && n_prompts > 1) {
-            fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
-            for (int i = 0;;) { // at least two iteration (n_prompts > 1)
-                fprintf(stdout, " [");
-                for (int j = 0;;) { // at least two iteration (n_prompts > 1)
+            LOG(",\n \"cosineSimilarity\": [\n");
+            for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
+                LOG(" [");
+                for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
                     float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                    fprintf(stdout, "%6.2f", sim);
+                    LOG("%6.2f", sim);
                     j++;
-                    if (j < n_prompts) fprintf(stdout, ", "); else break;
+                    if (j < n_embd_count) LOG(", "); else break;
                 }
-                fprintf(stdout, " ]");
+                LOG(" ]");
                 i++;
-                if (i < n_prompts) fprintf(stdout, ",\n"); else break;
+                if (i < n_embd_count) LOG(",\n"); else break;
            }
-            fprintf(stdout, "\n ]");
+            LOG("\n ]");
        }
 
-        if (notArray) fprintf(stdout, "\n}\n");
+        if (notArray) LOG("\n}\n");
     }
 
+    LOG("\n");
+    llama_perf_context_print(ctx);
+
     // clean up
-    llama_print_timings(ctx);
     llama_batch_free(batch);
     llama_free(ctx);
     llama_free_model(model);

package/src/llama.cpp/examples/eval-callback/eval-callback.cpp

@@ -1,11 +1,11 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 #include "ggml.h"
 
 #include <cstdio>
-#include <random>
 #include <string>
-#include <tuple>
 #include <vector>
 
 /**
@@ -31,22 +31,22 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
     GGML_ASSERT(n > 0);
     float sum = 0;
     for (int64_t i3 = 0; i3 < ne[3]; i3++) {
-        printf(" [\n");
+        LOG(" [\n");
         for (int64_t i2 = 0; i2 < ne[2]; i2++) {
             if (i2 == n && ne[2] > 2*n) {
-                printf(" ..., \n");
+                LOG(" ..., \n");
                 i2 = ne[2] - n;
            }
-            printf(" [\n");
+            LOG(" [\n");
             for (int64_t i1 = 0; i1 < ne[1]; i1++) {
                 if (i1 == n && ne[1] > 2*n) {
-                    printf(" ..., \n");
+                    LOG(" ..., \n");
                     i1 = ne[1] - n;
                }
-                printf(" [");
+                LOG(" [");
                 for (int64_t i0 = 0; i0 < ne[0]; i0++) {
                     if (i0 == n && ne[0] > 2*n) {
-                        printf("..., ");
+                        LOG("..., ");
                         i0 = ne[0] - n;
                    }
                     size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
@@ -64,16 +64,16 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
                     } else {
                         GGML_ABORT("fatal error");
                     }
-                    printf("%12.4f", v);
+                    LOG("%12.4f", v);
                     sum += v;
-                    if (i0 < ne[0] - 1) printf(", ");
+                    if (i0 < ne[0] - 1) LOG(", ");
                 }
-                printf("],\n");
+                LOG("],\n");
             }
-            printf(" ],\n");
+            LOG(" ],\n");
         }
-        printf(" ]\n");
-        printf(" sum = %f\n", sum);
+        LOG(" ]\n");
+        LOG(" sum = %f\n", sum);
     }
 }
 
@@ -102,11 +102,11 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
         snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
     }
 
-    printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
-           t->name, ggml_type_name(t->type), ggml_op_desc(t),
-           src0->name, ggml_ne_string(src0).c_str(),
-           src1 ? src1_str : "",
-           ggml_ne_string(t).c_str());
+    LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+        t->name, ggml_type_name(t->type), ggml_op_desc(t),
+        src0->name, ggml_ne_string(src0).c_str(),
+        src1 ? src1_str : "",
+        ggml_ne_string(t).c_str());
 
 
     // copy the data from the GPU memory if needed
@@ -127,12 +127,12 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
 }
 
 static bool run(llama_context * ctx, const gpt_params & params) {
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
 
     std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
 
     if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
-        fprintf(stderr, "%s : failed to eval\n", __func__);
+        LOG_ERR("%s : failed to eval\n", __func__);
         return false;
     }
 
@@ -144,14 +144,11 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
-    print_build_info();
-
-    std::mt19937 rng(params.seed);
+    gpt_init();
 
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -163,18 +160,20 @@ int main(int argc, char ** argv) {
     params.warmup = false;
 
     // init
-    llama_model * model;
-    llama_context * ctx;
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     if (model == nullptr || ctx == nullptr) {
-        fprintf(stderr, "%s : failed to init\n", __func__);
+        LOG_ERR("%s : failed to init\n", __func__);
         return 1;
     }
 
     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
     }
 
     bool OK = run(ctx, params);
@@ -182,7 +181,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    llama_print_timings(ctx);
+    LOG("\n");
+    llama_perf_context_print(ctx);
 
     llama_free(ctx);
     llama_free_model(model);
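
Taken together, the example diffs above all apply the same migration: gpt_params_parse() now takes a LLAMA_EXAMPLE_* id and handles usage printing itself, gpt_init() replaces print_build_info() and the manual seed/RNG setup, llama_init_from_gpt_params() returns a llama_init_result struct instead of a std::tie'd model/context pair, the common LOG()/LOG_INF()/LOG_WRN()/LOG_ERR() macros replace fprintf(stderr/stdout, ...), and llama_perf_context_print() replaces llama_print_timings(). A minimal sketch of the resulting example skeleton, assembled only from calls that appear in the hunks above (plus the examples' existing llama_backend_free() cleanup), might look like this:

    #include "arg.h"
    #include "common.h"
    #include "log.h"
    #include "llama.h"

    int main(int argc, char ** argv) {
        gpt_params params;

        // the parser now takes an example id and prints usage itself on failure
        if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
            return 1;
        }

        gpt_init(); // replaces print_build_info() and manual RNG seeding

        llama_backend_init();
        llama_numa_init(params.numa);

        // model and context now come back in a single llama_init_result struct
        llama_init_result llama_init = llama_init_from_gpt_params(params);

        llama_model   * model = llama_init.model;
        llama_context * ctx   = llama_init.context;
        if (model == nullptr || ctx == nullptr) {
            LOG_ERR("%s: failed to init\n", __func__); // LOG_* macros replace fprintf(stderr, ...)
            return 1;
        }

        // ... example-specific work ...

        LOG("\n");
        llama_perf_context_print(ctx); // replaces llama_print_timings(ctx)

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();

        return 0;
    }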