@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
package/src/LlamaCompletionWorker.cpp
@@ -1,6 +1,80 @@
 #include "LlamaCompletionWorker.h"
 #include "LlamaContext.h"
 
+// Computes FNV-1a hash of the data
+static std::string fnv_hash(const uint8_t * data, size_t len) {
+  const uint64_t fnv_prime = 0x100000001b3ULL;
+  uint64_t hash = 0xcbf29ce484222325ULL;
+
+  for (size_t i = 0; i < len; ++i) {
+    hash ^= data[i];
+    hash *= fnv_prime;
+  }
+  return std::to_string(hash);
+}
+
+static const std::string base64_chars =
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+    "abcdefghijklmnopqrstuvwxyz"
+    "0123456789+/";
+
+// Base64 decoding function
+static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
+  std::vector<uint8_t> decoded;
+  int in_len = encoded_string.size();
+  int i = 0;
+  int j = 0;
+  int in_ = 0;
+  unsigned char char_array_4[4], char_array_3[3];
+
+  while (in_len-- && (encoded_string[in_] != '=')) {
+    if (isspace(encoded_string[in_])) {
+      in_++;
+      continue;
+    }
+
+    if (encoded_string[in_] == '=' || base64_chars.find(encoded_string[in_]) == std::string::npos) {
+      break;
+    }
+
+    char_array_4[i++] = encoded_string[in_]; in_++;
+    if (i == 4) {
+      for (i = 0; i < 4; i++) {
+        char_array_4[i] = base64_chars.find(char_array_4[i]);
+      }
+
+      char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
+      char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+      char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
+
+      for (i = 0; i < 3; i++) {
+        decoded.push_back(char_array_3[i]);
+      }
+      i = 0;
+    }
+  }
+
+  if (i) {
+    for (j = i; j < 4; j++) {
+      char_array_4[j] = 0;
+    }
+
+    for (j = 0; j < 4; j++) {
+      char_array_4[j] = base64_chars.find(char_array_4[j]);
+    }
+
+    char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
+    char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+    char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
+
+    for (j = 0; j < i - 1; j++) {
+      decoded.push_back(char_array_3[j]);
+    }
+  }
+
+  return decoded;
+}
+
 size_t common_part(const std::vector<llama_token> &a,
                    const std::vector<llama_token> &b) {
   size_t i = 0;
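Editorial note: the fnv_hash helper added above uses the standard 64-bit FNV-1a parameters (offset basis 0xcbf29ce484222325, prime 0x100000001b3). A minimal standalone sketch, not part of the package, that reproduces the published FNV-1a test vector for the one-byte input "a":

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Same XOR-then-multiply order as the fnv_hash helper in the diff above.
static uint64_t fnv1a64(const uint8_t *data, size_t len) {
  uint64_t hash = 0xcbf29ce484222325ULL; // offset basis
  for (size_t i = 0; i < len; ++i) {
    hash ^= data[i];          // FNV-1a: XOR the byte first,
    hash *= 0x100000001b3ULL; // then multiply by the FNV prime
  }
  return hash;
}

int main() {
  const uint8_t a = 'a';
  // Expected per the published FNV-1a test vectors: af63dc4c8601ec8c
  printf("%llx\n", (unsigned long long)fnv1a64(&a, 1));
  return 0;
}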
@@ -10,6 +84,230 @@ size_t common_part(const std::vector<llama_token> &a,
   return i;
 }
 
+// Process images and add them to the tokenized input
+llama_pos processImage(
+    const mtmd_context* mtmd_ctx,
+    llama_context* ctx,
+    LlamaSessionPtr sess,
+    const std::vector<std::string>& image_paths,
+    const common_params& params,
+    std::vector<llama_token>& text_tokens
+) {
+  if (mtmd_ctx == nullptr) {
+    return false;
+  }
+
+  // Multimodal path
+  std::string full_prompt = params.prompt;
+  // Add image marker if it doesn't already exist
+  if (full_prompt.find("<__image__>") == std::string::npos) {
+    full_prompt += " <__image__>";
+  }
+
+  // Prepare bitmaps array for all images
+  mtmd::bitmaps bitmaps;
+
+  // Load all images
+  for (const auto& image_path : image_paths) {
+    fprintf(stdout, "[DEBUG] Loading image: %s\n",
+            image_path.substr(0, 50).c_str()); // Only log part of path for base64
+
+    // Check if it's a base64 image
+    if (image_path.compare(0, 11, "data:image/") == 0) {
+
+      // Parse base64 data
+      std::vector<std::string> parts;
+      size_t comma_pos = image_path.find(',');
+      if (comma_pos == std::string::npos) {
+        bitmaps.entries.clear();
+        return false;
+      }
+
+      std::string header = image_path.substr(0, comma_pos);
+      std::string base64_data = image_path.substr(comma_pos + 1);
+
+      if (header.find("base64") == std::string::npos) {
+        bitmaps.entries.clear();
+        return false;
+      }
+
+      // Decode base64
+      try {
+        // Decode base64 to binary
+        std::vector<uint8_t> image_data = base64_decode(base64_data);
+
+        // Load bitmap from memory buffer using direct initialization
+        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(image_data.data(), image_data.size()));
+        if (!bmp.ptr) {
+          bitmaps.entries.clear();
+          return false;
+        }
+
+        // Calculate bitmap hash (for KV caching)
+        std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
+        bmp.set_id(hash.c_str());
+        bitmaps.entries.push_back(std::move(bmp));
+      } catch (const std::exception& e) {
+        bitmaps.entries.clear();
+        return false;
+      }
+    } else if (image_path.compare(0, 7, "http://") == 0 || image_path.compare(0, 8, "https://") == 0) {
+      // HTTP URLs are not supported yet
+      bitmaps.entries.clear();
+      return false;
+    } else {
+      // Check if file exists
+      FILE* file = fopen(image_path.c_str(), "rb");
+      if (file == nullptr) {
+        bitmaps.entries.clear();
+        return false;
+      }
+
+      // Get file size
+      fseek(file, 0, SEEK_END);
+      long file_size = ftell(file);
+      fseek(file, 0, SEEK_SET);
+      fclose(file);
+
+      // Create bitmap directly
+      mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(image_path.c_str()));
+      if (!bmp.ptr) {
+        bitmaps.entries.clear();
+        return false;
+      }
+
+      // Calculate bitmap hash (for KV caching)
+      std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
+      bmp.set_id(hash.c_str());
+      bitmaps.entries.push_back(std::move(bmp));
+    }
+  }
+
+  mtmd_input_chunks* chunks = mtmd_input_chunks_init();
+  if (chunks == nullptr) {
+    bitmaps.entries.clear();
+    return false;
+  }
+
+  // Create input text
+  mtmd_input_text input_text;
+  input_text.text = full_prompt.c_str(); // Use the full prompt with image marker
+  input_text.add_special = true;   // Add BOS token if this is the first message
+  input_text.parse_special = true; // Parse special tokens like <__image__>
+
+  // Tokenize the text and images
+  fprintf(stdout, "[DEBUG] Tokenizing text and %zu images\n", bitmaps.entries.size());
+  auto bitmaps_c_ptr = bitmaps.c_ptr();
+
+  // Cast away const for mtmd_tokenize
+  int32_t res = mtmd_tokenize(
+      const_cast<mtmd_context*>(mtmd_ctx),
+      chunks,
+      &input_text,
+      bitmaps_c_ptr.data(),
+      bitmaps_c_ptr.size()
+  );
+
+  if (res != 0) {
+    mtmd_input_chunks_free(chunks);
+    bitmaps.entries.clear();
+    return false;
+  }
+
+  // Log chunk information
+  size_t num_chunks = mtmd_input_chunks_size(chunks);
+  fprintf(stdout, "[DEBUG] Tokenization successful: num_chunks=%zu\n", num_chunks);
+
+  // Clear text_tokens before adding new tokens
+  text_tokens.clear();
+
+  // Create a vector to store all tokens (both text and image)
+  std::vector<llama_token> all_tokens;
+
+  // Track the total number of tokens (both text and image)
+  size_t total_token_count = 0;
+
+  // chunk pos
+  std::vector<size_t> chunk_pos;
+  for (size_t i = 0; i < num_chunks; i++) {
+    chunk_pos.push_back(total_token_count);
+
+    const mtmd_input_chunk* chunk = mtmd_input_chunks_get(chunks, i);
+    mtmd_input_chunk_type chunk_type = mtmd_input_chunk_get_type(chunk);
+
+    if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+      size_t n_tokens;
+      const llama_token* tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
+
+      // Add text tokens
+      text_tokens.insert(text_tokens.end(), tokens, tokens + n_tokens);
+      all_tokens.insert(all_tokens.end(), tokens, tokens + n_tokens);
+      total_token_count += n_tokens;
+    } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+      const mtmd_image_tokens* img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+      size_t n_tokens = mtmd_image_tokens_get_n_tokens(img_tokens);
+      size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
+
+      for (size_t j = 0; j < n_pos; j++) {
+        all_tokens.push_back(LLAMA_TOKEN_NULL);
+      }
+      total_token_count += n_pos;
+    }
+  }
+
+  llama_pos n_past = common_part(*sess->tokens_ptr(), all_tokens);
+
+  llama_pos new_n_past = n_past;
+
+  for (size_t i = 0; i < chunk_pos.size(); i++) {
+    fprintf(stdout, "[DEBUG] Evaluating chunk %zu: n_past=%d, chunk_pos=%zu\n", i, n_past, chunk_pos[i]);
+
+    // Process chunk only if it's after the current n_past
+    if (chunk_pos[i] >= new_n_past) {
+      bool chunk_logits_last = (i == num_chunks - 1);
+      auto chunk = mtmd_input_chunks_get(chunks, i);
+
+      // Cast away const for mtmd_helper_eval_chunk_single
+      int32_t res = mtmd_helper_eval_chunk_single(
+          const_cast<mtmd_context*>(mtmd_ctx),
+          ctx,
+          chunk,
+          n_past,
+          0,
+          params.n_batch, // batch size
+          chunk_logits_last,
+          &new_n_past
+      );
+
+      if (res != 0) {
+        mtmd_input_chunks_free(chunks);
+        bitmaps.entries.clear();
+        return false;
+      }
+      n_past = new_n_past;
+    }
+  }
+
+  if (n_past == total_token_count) {
+    // we have to evaluate at least 1 token to generate logits.
+    n_past--;
+  }
+
+  // Update sampling context to process token sequences
+  for (auto & token : all_tokens) {
+    if (token == LLAMA_TOKEN_NULL) {
+      continue;
+    }
+  }
+  // Set the tokens
+  sess->set_tokens(std::move(all_tokens));
+
+  // Clean up image resources
+  mtmd_input_chunks_free(chunks);
+  bitmaps.entries.clear();
+  return n_past;
+}
+
 size_t findStoppingStrings(const std::string &text,
                            const size_t last_token_size,
                            const std::vector<std::string> &stop_words) {
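Editorial note: the chunk bookkeeping above enables prefix reuse: common_part returns the length of the token prefix shared with the session cache, chunks whose start offset falls inside that prefix are skipped, and n_past is stepped back by one when the entire prompt was cached so that a decode still produces logits. A minimal sketch with assumed numbers (not package code):

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  // Assume the session cache shares a 5-token prefix with the new prompt
  // (i.e. common_part(...) returned 5).
  size_t n_past = 5;
  // Hypothetical chunk start offsets: text [0..4], image [5..12], text [13..19].
  std::vector<size_t> chunk_pos = {0, 5, 13};
  size_t total_token_count = 20;

  for (size_t i = 0; i < chunk_pos.size(); ++i) {
    // Mirrors `if (chunk_pos[i] >= new_n_past)` in processImage above.
    bool eval = chunk_pos[i] >= n_past;
    printf("chunk %zu at offset %zu -> %s\n", i, chunk_pos[i],
           eval ? "evaluate" : "reuse KV cache");
  }

  // Had the whole prompt been cached, step back one position so the next
  // llama_decode still produces logits to sample from.
  if (n_past == total_token_count) {
    n_past--;
  }
  return 0;
}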
@@ -36,9 +334,11 @@ LlamaCompletionWorker::LlamaCompletionWorker(
     const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
     Napi::Function callback, common_params params,
     std::vector<std::string> stop_words,
-    int32_t chat_format)
+    int32_t chat_format,
+    std::vector<std::string> image_paths)
     : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
-      _params(params), _stop_words(stop_words), _chat_format(chat_format) {
+      _params(params), _stop_words(stop_words), _chat_format(chat_format),
+      _image_paths(image_paths) {
   if (!callback.IsEmpty()) {
     _tsfn = Napi::ThreadSafeFunction::New(info.Env(), callback,
                                           "LlamaCompletionCallback", 0, 1);
@@ -70,18 +370,59 @@ void LlamaCompletionWorker::Execute() {
   LlamaCppSampling sampling{common_sampler_init(model, _params.sampling),
                             common_sampler_free};
 
-  std::vector<llama_token> prompt_tokens =
-      ::common_tokenize(ctx, _params.prompt, add_bos);
-  n_input = prompt_tokens.size();
-  if (_sess->tokens_ptr()->size() > 0) {
-    n_cur = common_part(*(_sess->tokens_ptr()), prompt_tokens);
-    if (n_cur == n_input) {
-      --n_cur;
+  std::vector<llama_token> prompt_tokens;
+
+  // Process images if any are provided
+  if (!_image_paths.empty()) {
+    const auto* mtmd_ctx = _sess->get_mtmd_ctx();
+
+    if (mtmd_ctx != nullptr) {
+      // Process the images and get the tokens
+      n_cur = processImage(
+          mtmd_ctx,
+          ctx,
+          _sess,
+          _image_paths,
+          _params,
+          prompt_tokens
+      );
+
+      if (n_cur <= 0) {
+        SetError("Failed to process images");
+        _sess->get_mutex().unlock();
+        return;
+      }
+
+      fprintf(stdout, "[DEBUG] Image processing successful, n_cur=%zu, tokens=%zu\n",
+              n_cur, _sess->tokens_ptr()->size());
+
+      n_input = _sess->tokens_ptr()->size();
+      if (n_cur == n_input) {
+        --n_cur;
+      }
+      n_input -= n_cur;
+      llama_kv_self_seq_rm(ctx, 0, n_cur, -1);
+    } else {
+      SetError("Multimodal context not initialized");
+      _sess->get_mutex().unlock();
+      return;
+    }
+  } else {
+    // Text-only path
+    prompt_tokens = ::common_tokenize(ctx, _params.prompt, add_bos);
+    n_input = prompt_tokens.size();
+
+    if (_sess->tokens_ptr()->size() > 0) {
+      n_cur = common_part(*(_sess->tokens_ptr()), prompt_tokens);
+      if (n_cur == n_input) {
+        --n_cur;
+      }
+      n_input -= n_cur;
+      llama_kv_self_seq_rm(ctx, 0, n_cur, -1);
     }
-    n_input -= n_cur;
-    llama_kv_cache_seq_rm(ctx, 0, n_cur, -1);
+    // Set the tokens
+    _sess->set_tokens(std::move(prompt_tokens));
   }
-  _sess->set_tokens(std::move(prompt_tokens));
 
   const int max_len = _params.n_predict < 0 ? 0 : _params.n_predict;
   _sess->tokens_ptr()->reserve(_sess->tokens_ptr()->size() + max_len);
@@ -90,11 +431,17 @@ void LlamaCompletionWorker::Execute() {
   for (int i = 0; i < max_len || _stop; i++) {
     // check if we need to remove some tokens
     if (embd->size() >= _params.n_ctx) {
+      if (!_params.ctx_shift) {
+        // Context is full and ctx_shift is disabled, so we need to stop
+        _result.context_full = true;
+        break;
+      }
+
       const int n_left = n_cur - n_keep - 1;
       const int n_discard = n_left / 2;
 
-      llama_kv_cache_seq_rm(ctx, 0, n_keep + 1, n_keep + n_discard + 1);
-      llama_kv_cache_seq_add(ctx, 0, n_keep + 1 + n_discard, n_cur, -n_discard);
+      llama_kv_self_seq_rm(ctx, 0, n_keep + 1, n_keep + n_discard + 1);
+      llama_kv_self_seq_add(ctx, 0, n_keep + 1 + n_discard, n_cur, -n_discard);
 
       // shift the tokens
       embd->insert(embd->begin() + n_keep + 1,
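Editorial note: the shift above evicts half of the non-pinned window once the cache is full. A worked sketch with assumed numbers (not package code):

#include <cstdio>

int main() {
  int n_ctx  = 4096; // hypothetical context size
  int n_cur  = 4096; // cache is full
  int n_keep = 64;   // tokens pinned at the start (e.g. a system prompt)

  int n_left    = n_cur - n_keep - 1; // 4031 tokens eligible for eviction
  int n_discard = n_left / 2;         // 2015 tokens dropped

  // Cells [n_keep+1, n_keep+n_discard+1) are removed, and the survivors at
  // [n_keep+1+n_discard, n_cur) are shifted left by n_discard positions,
  // mirroring the llama_kv_self_seq_rm / llama_kv_self_seq_add pair above.
  printf("discard [%d, %d), shift [%d, %d) by -%d\n",
         n_keep + 1, n_keep + n_discard + 1,
         n_keep + 1 + n_discard, n_cur, n_discard);
  // unused in this sketch, but kept for clarity
  (void)n_ctx;
  return 0;
}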
@@ -104,12 +451,18 @@ void LlamaCompletionWorker::Execute() {
       n_cur -= n_discard;
       _result.truncated = true;
     }
-    int ret = llama_decode(
-        ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
-    if (ret < 0) {
-      SetError("Failed to decode token, code: " + std::to_string(ret));
-      break;
+
+    // For multimodal input, n_past might already be set
+    // Only decode text tokens if we have any input left
+    if (n_input > 0) {
+      int ret = llama_decode(
+          ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
+      if (ret < 0) {
+        SetError("Failed to decode token, code: " + std::to_string(ret));
+        break;
+      }
     }
+
     // sample the next token
     const llama_token new_token_id =
         common_sampler_sample(sampling.get(), ctx, -1);
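Editorial note: the new n_input > 0 guard matters because on the multimodal path the prompt may already have been evaluated by mtmd_helper_eval_chunk_single, leaving nothing for llama_decode. A rough sketch of the loop's n_cur/n_input bookkeeping, with the real llama.cpp call left as a comment (assumed values and simplified flow, not package code):

#include <cstdio>

int main() {
  int n_cur   = 5;  // tokens already in the KV cache (shared prefix)
  int n_input = 12; // remaining prompt tokens still to decode

  for (int i = 0; i < 3; ++i) {
    if (n_input > 0) {
      // llama_decode(ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
      printf("iteration %d: decode %d token(s) starting at %d\n", i, n_input, n_cur);
    } else {
      printf("iteration %d: nothing to decode (already evaluated)\n", i);
    }
    n_cur += n_input; // advance past what was just decoded
    n_input = 1;      // the next iteration feeds only the newly sampled token
  }
  return 0;
}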
@@ -161,6 +514,8 @@ void LlamaCompletionWorker::OnOK() {
                         _result.tokens_predicted));
   result.Set("truncated",
              Napi::Boolean::New(env, _result.truncated));
+  result.Set("context_full",
+             Napi::Boolean::New(env, _result.context_full));
   result.Set("text",
              Napi::String::New(env, _result.text.c_str()));
 
package/src/LlamaCompletionWorker.h
@@ -1,9 +1,16 @@
+#pragma once
+
 #include "common.hpp"
+#include <atomic>
 #include <functional>
+#include <napi.h>
+#include "tools/mtmd/mtmd.h"
+#include "tools/mtmd/clip.h"
 
 struct CompletionResult {
   std::string text = "";
   bool truncated = false;
+  bool context_full = false;
   size_t tokens_predicted = 0;
   size_t tokens_evaluated = 0;
 };
@@ -13,28 +20,42 @@ class LlamaCompletionWorker : public Napi::AsyncWorker,
 public:
   LlamaCompletionWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
                         Napi::Function callback, common_params params,
-                        std::vector<std::string> stop_words = {},
-                        int32_t chat_format = 0);
+                        std::vector<std::string> stop_words,
+                        int32_t chat_format,
+                        std::vector<std::string> image_paths = {});
 
   ~LlamaCompletionWorker();
 
-  inline void Stop() { _stop = true; }
+  Napi::Promise GetPromise() { return Napi::Promise::Deferred::Promise(); }
+
+  void OnComplete(std::function<void()> cb) {
+    _onComplete = cb;
+  }
 
-  inline void onComplete(std::function<void()> cb) { _onComplete = cb; }
+  void SetStop() {
+    _stop = true;
+  }
 
 protected:
-  void Execute();
-  void OnOK();
-  void OnError(const Napi::Error &err);
+  void Execute() override;
+  void OnOK() override;
+  void OnError(const Napi::Error &err) override;
 
 private:
   LlamaSessionPtr _sess;
   common_params _params;
   std::vector<std::string> _stop_words;
   int32_t _chat_format;
-  Napi::ThreadSafeFunction _tsfn;
+  std::vector<std::string> _image_paths;
+  std::function<void()> _onComplete;
   bool _has_callback = false;
   bool _stop = false;
-  std::function<void()> _onComplete;
-  CompletionResult _result;
+  Napi::ThreadSafeFunction _tsfn;
+  struct {
+    size_t tokens_evaluated = 0;
+    size_t tokens_predicted = 0;
+    bool truncated = false;
+    bool context_full = false;
+    std::string text;
+  } _result;
 };