@fugood/llama.node 0.3.15 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +243 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +14 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -8
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2413 -228
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1004 -13516
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +127 -33
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +29 -293
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +210 -286
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  136. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  138. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +692 -126
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +21 -10
  143. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  144. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  145. package/src/llama.cpp/include/llama.h +30 -11
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  147. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  149. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  150. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  151. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  152. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  153. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  154. package/src/llama.cpp/src/llama-arch.cpp +161 -17
  155. package/src/llama.cpp/src/llama-arch.h +16 -0
  156. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  157. package/src/llama.cpp/src/llama-chat.h +6 -2
  158. package/src/llama.cpp/src/llama-context.cpp +108 -92
  159. package/src/llama.cpp/src/llama-context.h +1 -2
  160. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  161. package/src/llama.cpp/src/llama-graph.h +26 -6
  162. package/src/llama.cpp/src/llama-hparams.h +13 -0
  163. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  164. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  165. package/src/llama.cpp/src/llama-memory.h +1 -1
  166. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  167. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  168. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  169. package/src/llama.cpp/src/llama-model.cpp +1544 -291
  170. package/src/llama.cpp/src/llama-model.h +13 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  172. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  173. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  174. package/src/llama.cpp/src/llama.cpp +1 -1
  175. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  176. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  177. package/src/llama.cpp/tests/test-backend-ops.cpp +139 -57
  178. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  179. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  180. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  181. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  182. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  183. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  184. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  185. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  186. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  188. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  189. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  190. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  191. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  192. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  193. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  203. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
@@ -2,7 +2,6 @@
 #undef NDEBUG
 #endif
 
-#include "unicode.h"
 #include "sampling.h"
 
 #include <cassert>
@@ -84,7 +83,7 @@ static void test(const std::string & test_desc, const std::string & grammar_str,
 
         fprintf(stderr,
                 "\n NOTE: Debug grammar file generated. To analyze this failure in detail, run the following "
-                "command: ./llama-gbnf-validator test-grammar-integration.grammar.gbnf "
+                "command: ./test-gbnf-validator test-grammar-integration.grammar.gbnf "
                 "test-grammar-integration.string.txt\n\n");
     } else {
         fprintf(stdout, "✅︎\n");
@@ -1086,6 +1085,65 @@ static void test_json_schema() {
     });
 }
 
+static void one_hot(llama_token_data_array & tok_arr, llama_token selected) {
+    auto n_vocab = tok_arr.size;
+
+    tok_arr.selected = -1;
+    tok_arr.sorted = false;
+    for (llama_token token_id = 0; token_id < (llama_token) n_vocab; token_id++) {
+        tok_arr.data[token_id].id = token_id;
+        tok_arr.data[token_id].logit = 0.0f;
+    }
+
+    tok_arr.data[selected].logit = 100.0f;
+}
+
+static void test_sampler_chain(void) {
+    auto sparams = llama_sampler_chain_default_params();
+    sparams.no_perf = false;
+    llama_sampler * sampler = llama_sampler_chain_init(sparams);
+
+    const auto grammar_data = R"(%llguidance {}
+start: /[A-Z ]*/)";
+
+    llama_sampler_chain_add(sampler, llama_sampler_init_llg(vocab, "lark", grammar_data));
+    llama_sampler_chain_add(sampler, llama_sampler_init_dist(42));
+
+    auto input = "ALL YOUR BASE ARE BELONG TO US";
+    auto tokens = common_tokenize(vocab, input, false, false);
+
+    auto n_vocab = llama_vocab_n_tokens(vocab);
+
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
+    for (llama_token token_id = 0; token_id < (llama_token) n_vocab; token_id++) {
+        cur.emplace_back(llama_token_data{ token_id, 0.0f, 0.0f });
+    }
+    auto tok_arr = llama_token_data_array{ cur.data(), cur.size(), -1, false };
+
+    for (const auto token : tokens) {
+        one_hot(tok_arr, token);
+
+        fprintf(stderr, "applying token: %d\n", token);
+        llama_sampler_apply(sampler, &tok_arr);
+
+        auto idx = tok_arr.selected;
+        fprintf(stderr, " -> %d %f\n", cur[idx].id, cur[idx].logit);
+        assert(cur[tok_arr.selected].id == token);
+        llama_sampler_accept(sampler, token);
+    }
+
+    auto tok_eos = llama_vocab_eot(vocab);
+    if (tok_eos == LLAMA_TOKEN_NULL) {
+        tok_eos = llama_vocab_eos(vocab);
+    }
+
+    one_hot(tok_arr, tok_eos);
+
+    llama_sampler_apply(sampler, &tok_arr);
+    assert(cur[tok_arr.selected].id == tok_eos);
+}
+
 int main(int argc, const char ** argv) {
     fprintf(stdout, "Running llguidance integration tests...\n");
 
@@ -1135,6 +1193,9 @@ int main(int argc, const char ** argv) {
     test_special_chars();
     test_quantifiers();
     test_json_schema();
+
+    test_sampler_chain();
+
     fprintf(stdout, "All tests passed.\n");
     return 0;
 }
@@ -3,7 +3,9 @@
 #endif
 
 #include "llama.h"
-#include "llama-grammar.h"
+
+// TODO: shold not include libllama sources
+#include "../src/llama-grammar.h"
 
 #include <cassert>
 
@@ -4,7 +4,7 @@
 
 #include "json-schema-to-grammar.h"
 
-#include "llama-grammar.h"
+#include "../src/llama-grammar.h"
 
 #include <cassert>
 #include <fstream>
@@ -597,6 +597,22 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        )"""
    });
 
+    test({
+        SUCCESS,
+        "maxItems 0",
+        R"""({
+            "items": {
+                "type": "boolean"
+            },
+            "maxItems": 0
+        })""",
+        R"""(
+            boolean ::= ("true" | "false") space
+            root ::= "[" space "]" space
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )"""
+    });
+
     test({
         SUCCESS,
         "maxItems 1",
@@ -3,7 +3,8 @@
 #endif
 
 #include "llama.h"
-#include "llama-grammar.h"
+
+#include "../src/llama-grammar.h"
 
 #include <cassert>
 #include <stdexcept>
@@ -1,8 +1,10 @@
 #include "ggml.h"
+#include "ggml-cpu.h"
 #include "llama.h"
-#include "llama-model.h"
 #include "common.h"
 
+#include "../src/llama-model.h"
+
 #include <algorithm>
 #include <cassert>
 #include <cinttypes>
@@ -1,8 +1,9 @@
 #include "llama.h"
 #include "common.h"
-#include "unicode.h"
 #include "console.h"
 
+#include "../src/unicode.h"
+
 #include <cassert>
 #include <codecvt>
 #include <cstdio>
@@ -1,8 +1,9 @@
 #include "llama.h"
 #include "common.h"
-#include "unicode.h"
 #include "console.h"
 
+#include "../src/unicode.h"
+
 #include <cassert>
 #include <codecvt>
 #include <cstdio>
@@ -1,5 +0,0 @@
-set(TARGET llama-gbnf-validator)
-add_executable(${TARGET} gbnf-validator.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -1,341 +0,0 @@
-#include "arg.h"
-#include "log.h"
-#include "common.h"
-#include "sampling.h"
-#include "clip.h"
-#include "stb_image.h"
-#include "llama.h"
-#include "ggml.h"
-#include "console.h"
-
-#include <vector>
-#include <limits.h>
-#include <inttypes.h>
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#include <signal.h>
-#endif
-
-static bool g_is_generating = false;
-
-/**
- * Please note that this is NOT a production-ready stuff.
- * It is a playground for trying Gemma 3 vision capabilities.
- * For contributors: please keep this code simple and easy to understand.
- */
-
-static void show_additional_info(int /*argc*/, char ** argv) {
-    LOG(
-        "Experimental CLI for using Gemma 3 vision model\n\n"
-        "Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
-        " -m and --mmproj are required\n"
-        " --image and -p are optional, if NOT provided, the CLI will run in chat mode\n",
-        argv[0]
-    );
-}
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-static void sigint_handler(int signo) {
-    if (signo == SIGINT) {
-        if (g_is_generating) {
-            g_is_generating = false;
-        } else {
-            console::cleanup();
-            LOG("\nInterrupted by user\n");
-            _exit(130);
-        }
-    }
-}
-#endif
-
-struct gemma3_context {
-    struct clip_ctx * ctx_clip = NULL;
-    common_init_result llama_init;
-
-    llama_model * model;
-    llama_context * lctx;
-    const llama_vocab * vocab;
-    llama_batch batch;
-
-    int n_threads = 1;
-    llama_pos n_past = 0;
-
-    gemma3_context(common_params & params) : llama_init(common_init_from_params(params)) {
-        model = llama_init.model.get();
-        lctx = llama_init.context.get();
-        vocab = llama_model_get_vocab(model);
-        n_threads = params.cpuparams.n_threads;
-        batch = llama_batch_init(params.n_batch, 0, 1);
-        init_clip_model(params);
-    }
-
-    void init_clip_model(common_params & params) {
-        const char * clip_path = params.mmproj.c_str();
-        ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
-    }
-
-    ~gemma3_context() {
-        clip_free(ctx_clip);
-    }
-};
-
-struct decode_embd_batch {
-    std::vector<llama_pos> pos;
-    std::vector<int32_t> n_seq_id;
-    std::vector<llama_seq_id> seq_id_0;
-    std::vector<llama_seq_id *> seq_ids;
-    std::vector<int8_t> logits;
-    llama_batch batch;
-    decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
-        pos .resize(n_tokens);
-        n_seq_id.resize(n_tokens);
-        seq_ids .resize(n_tokens + 1);
-        logits .resize(n_tokens);
-        seq_id_0.resize(1);
-        seq_id_0[0] = seq_id;
-        seq_ids [n_tokens] = nullptr;
-        batch = {
-            /*n_tokens =*/ n_tokens,
-            /*tokens =*/ nullptr,
-            /*embd =*/ embd,
-            /*pos =*/ pos.data(),
-            /*n_seq_id =*/ n_seq_id.data(),
-            /*seq_id =*/ seq_ids.data(),
-            /*logits =*/ logits.data(),
-        };
-        for (int i = 0; i < n_tokens; i++) {
-            batch.pos [i] = pos_0 + i;
-            batch.n_seq_id[i] = 1;
-            batch.seq_id [i] = seq_id_0.data();
-            batch.logits [i] = false;
-        }
-    }
-};
-
-static int eval_text(gemma3_context & ctx, std::string input, bool logits_last = false) {
-    llama_tokens tokens = common_tokenize(ctx.lctx, input, false, true);
-    common_batch_clear(ctx.batch);
-    for (llama_token & t : tokens) {
-        common_batch_add(ctx.batch, t, ctx.n_past++, {0}, false);
-    }
-    if (logits_last) {
-        ctx.batch.logits[ctx.batch.n_tokens - 1] = true;
-    }
-    // LOG("eval_text (n_tokens = %d): %s\n", (int)tokens.size(), input.c_str());
-    if (llama_decode(ctx.lctx, ctx.batch)) {
-        LOG_ERR("Failed to decode text\n");
-        return 1;
-    }
-    return 0;
-}
-
-static int eval_image(gemma3_context & ctx, std::string & fname) {
-    std::vector<float> image_embd_v;
-    int n_embd = llama_model_n_embd(ctx.model);
-    int n_tokens = 256;
-    image_embd_v.resize(n_tokens * n_embd);
-
-    bool ok;
-    struct clip_image_u8 * img_u8 = clip_image_u8_init();
-    ok = clip_image_load_from_file(fname.c_str(), img_u8);
-    if (!ok) {
-        LOG_ERR("Unable to load image %s\n", fname.c_str());
-        clip_image_u8_free(img_u8);
-        return 2; // non-fatal error
-    }
-
-    clip_image_f32_batch batch_f32;
-    ok = clip_image_preprocess(ctx.ctx_clip, img_u8, &batch_f32);
-    if (!ok) {
-        LOG_ERR("Unable to preprocess image\n");
-        clip_image_f32_batch_free(&batch_f32);
-        clip_image_u8_free(img_u8);
-        return 1;
-    }
-
-    int64_t t0 = ggml_time_ms();
-    LOG("Encoding image %s\n", fname.c_str());
-    ok = clip_image_batch_encode(ctx.ctx_clip, ctx.n_threads, &batch_f32, image_embd_v.data());
-    if (!ok) {
-        LOG_ERR("Unable to encode image\n");
-        clip_image_f32_batch_free(&batch_f32);
-        clip_image_u8_free(img_u8);
-        return 1;
-    }
-    LOG("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
-
-    clip_image_f32_batch_free(&batch_f32);
-    clip_image_u8_free(img_u8);
-
-    // decode image embeddings
-    int64_t t1 = ggml_time_ms();
-    eval_text(ctx, "<start_of_image>");
-    llama_set_causal_attn(ctx.lctx, false);
-    decode_embd_batch batch_img(image_embd_v.data(), n_tokens, ctx.n_past, 0);
-    if (llama_decode(ctx.lctx, batch_img.batch)) {
-        LOG_ERR("failed to decode image\n");
-        return 1;
-    }
-    ctx.n_past += n_tokens;
-    llama_set_causal_attn(ctx.lctx, true);
-    eval_text(ctx, "<end_of_image>");
-    LOG("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
-    return 0;
-}
-
-static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
-    for (int i = 0; i < n_predict; i++) {
-        if (i > n_predict || !g_is_generating) {
-            printf("\n");
-            break;
-        }
-
-        llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1);
-        common_sampler_accept(smpl, token_id, true);
-
-        if (llama_vocab_is_eog(ctx.vocab, token_id)) {
-            printf("\n");
-            break; // end of generation
-        }
-
-        printf("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
-        fflush(stdout);
-
-        // eval the token
-        common_batch_clear(ctx.batch);
-        common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
-        if (llama_decode(ctx.lctx, ctx.batch)) {
-            LOG_ERR("failed to decode token\n");
-            return 1;
-        }
-    }
-    return 0;
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    common_params params;
-    params.sampling.temp = 0.2; // lower temp by default for better quality
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
-        return 1;
-    }
-
-    common_init();
-
-    if (params.mmproj.empty()) {
-        show_additional_info(argc, argv);
-        return 1;
-    }
-
-    gemma3_context ctx(params);
-    printf("%s: %s\n", __func__, params.model.c_str());
-
-    bool is_single_turn = !params.prompt.empty() && !params.image.empty();
-
-    struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
-    int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
-
-    // ctrl+C handling
-    {
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-        struct sigaction sigint_action;
-        sigint_action.sa_handler = sigint_handler;
-        sigemptyset (&sigint_action.sa_mask);
-        sigint_action.sa_flags = 0;
-        sigaction(SIGINT, &sigint_action, NULL);
-#elif defined (_WIN32)
-        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
-            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
-        };
-        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
-#endif
-    }
-
-    if (eval_text(ctx, "<bos>")) {
-        return 1;
-    }
-
-    if (is_single_turn) {
-        g_is_generating = true;
-        if (eval_text(ctx, "<start_of_turn>user\n")) {
-            return 1;
-        }
-        for (auto & fname : params.image) {
-            if (eval_image(ctx, fname)) {
-                return 1;
-            }
-        }
-        if (eval_text(ctx, params.prompt + "<end_of_turn><start_of_turn>model\n", true)) {
-            return 1;
-        }
-        if (generate_response(ctx, smpl, n_predict)) {
-            return 1;
-        }
-
-    } else {
-        LOG("\n Running in chat mode, available commands:");
-        LOG("\n /image <path> load an image");
-        LOG("\n /clear clear the chat history");
-        LOG("\n /quit or /exit exit the program");
-        LOG("\n");
-
-        if (eval_text(ctx, "<start_of_turn>user\n")) {
-            return 1;
-        }
-
-        while (true) {
-            g_is_generating = false;
-            LOG("\n> ");
-            console::set_display(console::user_input);
-            std::string line;
-            console::readline(line, false);
-            console::set_display(console::reset);
-            line = string_strip(line);
-            if (line.empty()) {
-                continue;
-            }
-            if (line == "/quit" || line == "/exit") {
-                break;
-            }
-            if (line == "/clear") {
-                ctx.n_past = 0;
-                llama_kv_self_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
-                LOG("Chat history cleared\n\n");
-                continue;
-            }
-            g_is_generating = true;
-            if (line.find("/image") == 0) {
-                std::string image = line.substr(7);
-                int res = eval_image(ctx, image);
-                if (res == 2) {
-                    continue; // image not found
-                }
-                if (res) {
-                    return 1;
-                }
-                continue;
-            }
-            if (eval_text(ctx, line + "<end_of_turn><start_of_turn>model\n", true)) {
-                return 1;
-            }
-            if (generate_response(ctx, smpl, n_predict)) {
-                return 1;
-            }
-            if (eval_text(ctx, "<end_of_turn><start_of_turn>user\n")) {
-                return 1;
-            }
-        }
-    }
-
-    return 0;
-}