@fugood/llama.node 0.3.0 → 0.3.2

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that public registry.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -9,6 +9,7 @@
 #include <climits>
 #include <cstring>
 #include <cstdarg>
+#include <cinttypes>
 #include <ctime>
 #include <random>
 #include <stdexcept>
@@ -105,43 +106,43 @@ static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_
     const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
     try {
         w->token_embedding_table.resize(p->vocab_size * p->dim);
-        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
 
         w->rms_att_weight.resize(p->n_layers * p->dim);
-        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
 
         w->rms_ffn_weight.resize(p->n_layers * p->dim);
-        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
 
         w->wq.resize(p->n_layers * p->dim * p->dim);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
 
         w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
 
         w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
 
         w->wo.resize(p->n_layers * p->dim * p->dim);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
 
         w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
 
         w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
 
         w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
 
         w->rms_final_weight.resize(p->dim);
-        LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
+        LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
 
         if (shared_weights) {
             w->wcls = {};
         } else {
             w->wcls.resize(p->vocab_size * p->dim);
-            LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+            LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
         }
     }
     catch (std::length_error &) {
@@ -173,7 +174,7 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
     fseek(f, 0, SEEK_END);
     auto end = ftell(f);
     if (curr != end) {
-        LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
+        LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
         return 1;
     }
 
@@ -181,26 +182,26 @@
 }
 
 static void print_sample_weights(TransformerWeights *w){
-    LOG("----- Quick print of first of the weight vales of all the variables\n");
-    LOG("%f\n", w->token_embedding_table[0]);
-    LOG("%f\n", w->rms_att_weight[0]);
-    LOG("%f\n", w->rms_ffn_weight[0]);
-
-    LOG("%f\n", w->wq[0]);
-    LOG("%f\n", w->wk[0]);
-    LOG("%f\n", w->wv[0]);
-    LOG("%f\n", w->wo[0]);
-    LOG("%f\n", w->w1[0]);
-    LOG("%f\n", w->w2[0]);
-    LOG("%f\n", w->w3[0]);
-    LOG("%f\n", w->rms_att_weight[0]);
-    if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
+    LOG_INF("----- Quick print of first of the weight vales of all the variables\n");
+    LOG_INF("%f\n", w->token_embedding_table[0]);
+    LOG_INF("%f\n", w->rms_att_weight[0]);
+    LOG_INF("%f\n", w->rms_ffn_weight[0]);
+
+    LOG_INF("%f\n", w->wq[0]);
+    LOG_INF("%f\n", w->wk[0]);
+    LOG_INF("%f\n", w->wv[0]);
+    LOG_INF("%f\n", w->wo[0]);
+    LOG_INF("%f\n", w->w1[0]);
+    LOG_INF("%f\n", w->w2[0]);
+    LOG_INF("%f\n", w->w3[0]);
+    LOG_INF("%f\n", w->rms_att_weight[0]);
+    if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 //////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.
 
-struct llama_vocab {
+struct my_llama_vocab {
     using id = int32_t;
     using token = std::string;
     using ttype = llama_token_type;
@@ -318,20 +319,20 @@ struct train_params {
 };
 
 static void print_params(struct my_llama_hparams * params) {
-    LOG("%s: n_vocab: %u\n", __func__, params->n_vocab);
-    LOG("%s: n_ctx: %u\n", __func__, params->n_ctx);
-    LOG("%s: n_embd: %u\n", __func__, params->n_embd);
-    LOG("%s: n_mult: %u\n", __func__, params->n_mult);
-    LOG("%s: n_head: %u\n", __func__, params->n_head);
-    LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
-    LOG("%s: n_ff: %u\n", __func__, params->n_ff);
-    LOG("%s: n_layer: %u\n", __func__, params->n_layer);
-    LOG("%s: n_rot: %u\n", __func__, params->n_rot);
+    LOG_INF("%s: n_vocab: %u\n", __func__, params->n_vocab);
+    LOG_INF("%s: n_ctx: %u\n", __func__, params->n_ctx);
+    LOG_INF("%s: n_embd: %u\n", __func__, params->n_embd);
+    LOG_INF("%s: n_mult: %u\n", __func__, params->n_mult);
+    LOG_INF("%s: n_head: %u\n", __func__, params->n_head);
+    LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
+    LOG_INF("%s: n_ff: %u\n", __func__, params->n_ff);
+    LOG_INF("%s: n_layer: %u\n", __func__, params->n_layer);
+    LOG_INF("%s: n_rot: %u\n", __func__, params->n_rot);
 }
 
 static void print_tensor_info(const struct ggml_context * ctx) {
     for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        LOG("%s: Allocating ", __func__);
+        LOG_INF("%s: Allocating ", __func__);
         int64_t total = 1;
         int i = 0;
         for (; i < ggml_n_dims(t); ++i) {
@@ -524,9 +525,9 @@ static std::string llama_escape_whitespaces(const std::string & text) {
     return out.str();
 }
 
-static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
+static void load_vocab(const char * filename, const Config * config, struct my_llama_vocab * vocab) {
     if (is_ggml_file(filename)) {
-        LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
+        LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
         struct ggml_context * ctx_data = NULL;
 
         struct gguf_init_params params = {
@@ -574,7 +575,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam
         gguf_free(ctx);
     } else {
         // assume llama2.c vocabulary
-        LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
+        LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
         llama_file file(filename, "rb");
         if (!file.fp) {
             die_fmt("%s: %s", strerror(errno), filename);
@@ -582,13 +583,13 @@
         const int n_vocab = config->vocab_size;
         /* uint32_t max_token_length = */ file.read_u32(); // unused
         vocab->id_to_token.resize(n_vocab);
-        for (llama_vocab::id id=0; id<n_vocab; ++id) {
+        for (my_llama_vocab::id id=0; id<n_vocab; ++id) {
             float_t score = file.read_f32();
             uint32_t len = file.read_u32();
             std::string text = file.read_string(len);
 
             unsigned char byte_val;
-            llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
+            my_llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
             if (id == UNKNOWN_TOKEN_ID) {
                 text = "<unk>";
                 type = LLAMA_TOKEN_TYPE_UNKNOWN;
@@ -630,7 +631,7 @@ static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const floa
 }
 
 static void save_as_llama_model(
-    struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
+    struct my_llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
 ) {
     // convert AK weights into GG weights one by one.
     // w->token_embedding_table -> model->tok_embeddings
@@ -670,7 +671,7 @@ static void save_as_llama_model(
     std::vector<const char*> tokens;
     std::vector<float> scores;
     std::vector<llama_token_type> token_types;
-    for (const llama_vocab::token_data & token_data : vocab->id_to_token) {
+    for (const my_llama_vocab::token_data & token_data : vocab->id_to_token) {
         tokens.push_back(token_data.text.c_str());
         scores.push_back(token_data.score);
         token_types.push_back(token_data.type);
@@ -871,23 +872,25 @@ static std::string basename(const std::string &path) {
 }
 
 int main(int argc, char ** argv) {
+    gpt_init();
+
     struct train_params params = get_default_train_params();
     if (!params_parse(argc, argv, &params)) {
         return 1;
     }
-    log_set_target(stdout);
+
     Config config;
     TransformerWeights weights = {};
     {
-        LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
+        LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
         FILE * file = fopen(params.fn_llama2c_model, "rb");
         if (!file) {
-            LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
+            LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
             return 1;
         }
         // read in the config header
         if (fread(&config, sizeof(Config), 1, file) != 1) {
-            LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
+            LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
            return 1;
         }
         auto shared_weights = config.vocab_size > 0;
@@ -896,13 +899,13 @@ int main(int argc, char ** argv) {
         // read in the Transformer weights
         alloc_weights(&weights, &config, shared_weights);
         if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
-            LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
+            LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
             return 1;
         }
         fclose(file);
     }
 
-    struct llama_vocab vocab;
+    struct my_llama_vocab vocab;
     load_vocab(params.fn_vocab_model, &config, &vocab);
 
     struct my_llama_model model;
@@ -929,7 +932,7 @@
     model.name = basename(params.fn_llama2c_model);
     save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
 
-    LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
+    LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
 
     ggml_free(model.ctx);
     return 0;
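
Note: every hunk in convert-llama2c-to-ggml.cpp above follows the same migration introduced by the new common/log.{h,cpp} (files 35 and 36 in the list): the example calls gpt_init() once at startup (replacing the removed log_set_target(stdout)) and swaps the untyped LOG(...) macro for the level-tagged LOG_INF(...)/LOG_ERR(...) macros. A minimal sketch of the new pattern, assuming gpt_init() and the LOG_* macros are exposed through the common headers ("common.h"/"log.h") as these hunks suggest; the file path and messages are illustrative:

    #include "common.h"
    #include "log.h"

    #include <cstdio>

    int main(int argc, char ** argv) {
        gpt_init();  // initialize the common logging state once, instead of log_set_target(stdout)

        const char * path = "model.bin";          // illustrative checkpoint path
        FILE * f = fopen(path, "rb");
        if (!f) {
            LOG_ERR("%s: Unable to open %s\n", __func__, path);   // error-level message
            return 1;
        }
        LOG_INF("%s: Loaded checkpoint %s\n", __func__, path);    // info-level message
        fclose(f);
        return 0;
    }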
package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 #include "ggml.h"
@@ -12,14 +13,15 @@
 #include "ggml-metal.h"
 #endif
 
+#include <algorithm>
+#include <climits>
 #include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
 #include <string>
 #include <tuple>
 #include <vector>
-#include <algorithm>
-#include <iostream>
-#include <fstream>
-#include <climits>
 
 
 //////////////////////////////////////////////////
@@ -35,9 +37,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
     return ret;
 }
 
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
     printf("\nexample usage:\n");
     printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
     printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
@@ -271,7 +271,7 @@ struct tokenized_prompt {
     size_t max_seq_len;
 
     tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
-        const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+        const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
         tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
         tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
         max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
@@ -390,8 +390,7 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
        return 1;
     }
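
The hunk above reflects the argument parser split out into common/arg.{h,cpp} (files 30 and 31 in the list): gpt_params_parse now takes a per-example identifier and a usage callback, so the caller no longer prints usage on failure itself. A sketch of the updated call site, using only names that appear in this diff; the usage text and model path are illustrative, and the exact moment the parser invokes the callback (help request or parse error) is an assumption:

    #include "arg.h"
    #include "common.h"

    #include <cstdio>

    static void print_usage(int, char ** argv) {
        printf("\nexample usage:\n");
        printf("\n    %s -m model.gguf\n\n", argv[0]);   // illustrative usage line
    }

    int main(int argc, char ** argv) {
        gpt_params params;

        // the LLAMA_EXAMPLE_* value selects which options apply to this tool;
        // the callback supplies example-specific usage text to the parser
        if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
            return 1;
        }

        // ... continue with the populated params
        return 0;
    }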
 
@@ -414,9 +413,10 @@
     llama_numa_init(params.numa);
 
     // load the model to get hparams
-    llama_model * model;
-    llama_context * ctx;
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
 
     // int n_ctx = llama_n_ctx(ctx);
     int n_layers = llama_n_layer(model);
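
As this hunk shows, llama_init_from_gpt_params no longer returns a (model, context) pair through std::tie; it returns a llama_init_result struct whose model and context fields are read out separately. A small helper sketch under that assumption; the NULL check is illustrative and not part of the diff:

    #include "common.h"
    #include "llama.h"

    // load the model and context from parsed params using the struct-returning API
    static bool init_model(gpt_params & params, llama_model *& model, llama_context *& ctx) {
        llama_init_result llama_init = llama_init_from_gpt_params(params);

        model = llama_init.model;
        ctx   = llama_init.context;

        return model != NULL && ctx != NULL;   // illustrative failure check
    }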
@@ -485,8 +485,8 @@
     if (use_pca) {
         // run PCA
         PCA::pca_params pca_params;
-        pca_params.n_threads = params.n_threads;
-        pca_params.n_batch = params.n_pca_batch;
+        pca_params.n_threads = params.cpuparams.n_threads;
+        pca_params.n_batch = params.n_pca_batch;
         pca_params.n_iterations = params.n_pca_iterations;
         PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
     } else {
package/src/llama.cpp/examples/cvector-generator/pca.hpp
@@ -12,12 +12,9 @@
 
 #include <cstdio>
 #include <ctime>
+#include <random>
 #include <string>
-#include <tuple>
 #include <vector>
-#include <algorithm>
-#include <iostream>
-#include <fstream>
 
 #define DEBUG_POS 5
 
@@ -207,13 +204,6 @@ static ggml_status compute_piter(
         ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
     }
 
-    // TODO: enable GPU support when support for GGML_OP_SQRT is added
-    //#ifdef GGML_USE_METAL
-    //    if (ggml_backend_is_metal(model.backend)) {
-    //        ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
-    //    }
-    //#endif
-
     ggml_status res = ggml_backend_graph_compute(model.backend, gf);
     if (res == GGML_STATUS_SUCCESS) {
         auto extract_i = [](std::string prefix, std::string str) -> int {
@@ -229,8 +219,8 @@
     result.eigenvectors.resize(params.n_batch);
     result.distances.resize(params.n_batch);
     // get output nodes
-    for (int i = 0; i < gf->n_nodes; ++i) {
-        auto node = gf->nodes[i];
+    for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
+        auto node = ggml_graph_node(gf, i);
         int iter = -1;
         // find b_tensor (without copying data from device)
         if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
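
The pca.hpp hunk above also illustrates the ggml graph API change that runs through this release: callers no longer read gf->n_nodes or gf->nodes[i] directly and instead use the ggml_graph_n_nodes()/ggml_graph_node() accessors. A minimal sketch assuming only those two accessors and the tensor name field used in the hunk; the helper name is illustrative:

    #include <cstdio>

    #include "ggml.h"

    // print the name of every node in a computed graph via the accessor API,
    // instead of touching ggml_cgraph internals directly
    static void list_graph_nodes(struct ggml_cgraph * gf) {
        for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
            struct ggml_tensor * node = ggml_graph_node(gf, i);
            printf("node %d: %s\n", i, node->name);
        }
    }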