@novastera-oss/llamarn 0.2.6 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (253)
  1. package/android/src/main/cpp/include/llama.h +141 -38
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +58 -24
  11. package/cpp/LlamaCppModel.h +3 -3
  12. package/cpp/PureCppImpl.cpp +1 -1
  13. package/cpp/PureCppImpl.h +2 -2
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  16. package/cpp/llama.cpp/Makefile +2 -2
  17. package/cpp/llama.cpp/README.md +32 -13
  18. package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
  19. package/cpp/llama.cpp/common/arg.cpp +37 -6
  20. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  21. package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
  22. package/cpp/llama.cpp/common/chat-parser.h +2 -0
  23. package/cpp/llama.cpp/common/chat.cpp +12 -9
  24. package/cpp/llama.cpp/common/chat.h +1 -1
  25. package/cpp/llama.cpp/common/common.cpp +53 -40
  26. package/cpp/llama.cpp/common/common.h +6 -2
  27. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  28. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  29. package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
  30. package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
  31. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  32. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  33. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  34. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
  35. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  38. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  88. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  90. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  91. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
  93. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
  94. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  95. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
  97. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  105. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  110. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  111. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
  112. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
  113. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  115. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  116. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  117. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  118. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
  119. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  131. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  132. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  133. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
  134. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
  135. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  136. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  137. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  138. package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
  139. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  140. package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
  141. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
  142. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
  143. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  144. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  145. package/cpp/llama.cpp/include/llama.h +141 -38
  146. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  147. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  148. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  149. package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
  150. package/cpp/llama.cpp/src/llama-arch.h +25 -1
  151. package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
  152. package/cpp/llama.cpp/src/llama-batch.h +110 -57
  153. package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
  154. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  155. package/cpp/llama.cpp/src/llama-context.cpp +360 -266
  156. package/cpp/llama.cpp/src/llama-context.h +27 -23
  157. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  158. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  159. package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
  160. package/cpp/llama.cpp/src/llama-graph.h +126 -58
  161. package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
  162. package/cpp/llama.cpp/src/llama-hparams.h +16 -2
  163. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
  164. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
  165. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
  166. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
  167. package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
  168. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
  169. package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
  170. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
  171. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
  172. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  173. package/cpp/llama.cpp/src/llama-memory.h +73 -36
  174. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  175. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  176. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  177. package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
  178. package/cpp/llama.cpp/src/llama-model.h +26 -0
  179. package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
  180. package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
  181. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  182. package/cpp/llama.cpp/src/llama.cpp +11 -7
  183. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  184. package/cpp/rn-completion.cpp +2 -2
  185. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  186. package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
  187. package/ios/include/chat.h +1 -1
  188. package/ios/include/common.h +6 -2
  189. package/ios/include/llama.h +141 -38
  190. package/ios/libs/llama.xcframework/Info.plist +15 -15
  191. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  192. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  193. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  194. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  195. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
  196. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  197. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  198. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  199. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  200. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  201. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  202. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  203. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  204. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  205. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  206. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
  207. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  208. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  209. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
  210. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  211. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  212. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
  213. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  219. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  220. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  221. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  222. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  223. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
  224. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  225. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  226. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  227. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  228. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  229. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  231. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  232. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  233. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
  234. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  235. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  236. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
  237. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  238. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  239. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
  240. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
  241. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  242. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  243. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  244. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  245. package/package.json +1 -2
  246. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  247. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  248. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  249. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
  250. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
  251. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  252. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  253. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
@@ -61,7 +61,10 @@ extern "C" {
61
61
  struct llama_model;
62
62
  struct llama_context;
63
63
  struct llama_sampler;
64
- struct llama_kv_cache;
64
+
65
+ typedef struct llama_memory_i * llama_memory_t;
66
+
67
+ struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
65
68
 
66
69
  typedef int32_t llama_pos;
67
70
  typedef int32_t llama_token;
@@ -240,18 +243,21 @@ extern "C" {
240
243
 
241
244
  typedef bool (*llama_progress_callback)(float progress, void * user_data);
242
245
 
243
- // Input data for llama_decode
246
+ // Input data for llama_encode/llama_decode
244
247
  // A llama_batch object can contain input about one or many sequences
245
248
  // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
246
249
  //
247
250
  // - token : the token ids of the input (used when embd is NULL)
248
251
  // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
249
252
  // - pos : the positions of the respective token in the sequence
250
- // (if set to NULL, the token position will be tracked automatically by llama_decode)
253
+ // (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
251
254
  // - seq_id : the sequence to which the respective token belongs
252
255
  // (if set to NULL, the sequence ID will be assumed to be 0)
253
256
  // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
254
- // (if set to NULL, only the logits for last token will be returned)
257
+ // (if set to NULL:
258
+ // - if embeddings: all tokens are output
259
+ // - if not: only the last token is output
260
+ // )
255
261
  //
256
262
  typedef struct llama_batch {
257
263
  int32_t n_tokens;
@@ -259,8 +265,8 @@ extern "C" {
259
265
  llama_token * token;
260
266
  float * embd;
261
267
  llama_pos * pos;
262
- int32_t * n_seq_id; // TODO: remove, should belong to only 1 sequence
263
- llama_seq_id ** seq_id; // TODO: become llama_seq_id * seq_id;
268
+ int32_t * n_seq_id;
269
+ llama_seq_id ** seq_id;
264
270
  int8_t * logits; // TODO: rename this to "output"
265
271
  } llama_batch;
266
272
 
@@ -384,6 +390,7 @@ extern "C" {
384
390
  void * imatrix; // pointer to importance matrix data
385
391
  void * kv_overrides; // pointer to vector containing overrides
386
392
  void * tensor_types; // pointer to vector containing tensor types
393
+ void * prune_layers; // pointer to vector containing layer indices to prune
387
394
  } llama_model_quantize_params;
388
395
 
389
396
  typedef struct llama_logit_bias {
@@ -493,9 +500,11 @@ extern "C" {
493
500
  DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
494
501
 
495
502
  LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
496
- LLAMA_API struct llama_kv_cache * llama_get_kv_self ( struct llama_context * ctx);
503
+ LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
497
504
  LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
498
505
 
506
+ DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
507
+
499
508
  LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
500
509
  LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
501
510
 
@@ -509,6 +518,13 @@ extern "C" {
509
518
  // Get the model's RoPE frequency scaling factor
510
519
  LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
511
520
 
521
+ // Returns the number of classifier outputs (only valid for classifier models)
522
+ // Undefined behavior for non-classifier models
523
+ LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model);
524
+
525
+ // Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided
526
+ LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i);
527
+
512
528
  LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);
513
529
 
514
530
  LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
@@ -609,7 +625,81 @@ extern "C" {
609
625
  int32_t il_end);
610
626
 
611
627
  //
612
- // KV cache
628
+ // Memory
629
+ //
630
+
631
+ // Clear the memory contents
632
+ // If data == true, the data buffers will also be cleared together with the metadata
633
+ LLAMA_API void llama_memory_clear(
634
+ llama_memory_t mem,
635
+ bool data);
636
+
637
+ // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
638
+ // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
639
+ // seq_id < 0 : match any sequence
640
+ // p0 < 0 : [0, p1]
641
+ // p1 < 0 : [p0, inf)
642
+ LLAMA_API bool llama_memory_seq_rm(
643
+ llama_memory_t mem,
644
+ llama_seq_id seq_id,
645
+ llama_pos p0,
646
+ llama_pos p1);
647
+
648
+ // Copy all tokens that belong to the specified sequence to another sequence
649
+ // p0 < 0 : [0, p1]
650
+ // p1 < 0 : [p0, inf)
651
+ LLAMA_API void llama_memory_seq_cp(
652
+ llama_memory_t mem,
653
+ llama_seq_id seq_id_src,
654
+ llama_seq_id seq_id_dst,
655
+ llama_pos p0,
656
+ llama_pos p1);
657
+
658
+ // Removes all tokens that do not belong to the specified sequence
659
+ LLAMA_API void llama_memory_seq_keep(
660
+ llama_memory_t mem,
661
+ llama_seq_id seq_id);
662
+
663
+ // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
664
+ // p0 < 0 : [0, p1]
665
+ // p1 < 0 : [p0, inf)
666
+ LLAMA_API void llama_memory_seq_add(
667
+ llama_memory_t mem,
668
+ llama_seq_id seq_id,
669
+ llama_pos p0,
670
+ llama_pos p1,
671
+ llama_pos delta);
672
+
673
+ // Integer division of the positions by factor of `d > 1`
674
+ // p0 < 0 : [0, p1]
675
+ // p1 < 0 : [p0, inf)
676
+ LLAMA_API void llama_memory_seq_div(
677
+ llama_memory_t mem,
678
+ llama_seq_id seq_id,
679
+ llama_pos p0,
680
+ llama_pos p1,
681
+ int d);
682
+
683
+ // Returns the smallest position present in the memory for the specified sequence
684
+ // This is typically non-zero only for SWA caches
685
+ // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
686
+ // Return -1 if the sequence is empty
687
+ LLAMA_API llama_pos llama_memory_seq_pos_min(
688
+ llama_memory_t mem,
689
+ llama_seq_id seq_id);
690
+
691
+ // Returns the largest position present in the memory for the specified sequence
692
+ // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
693
+ // Return -1 if the sequence is empty
694
+ LLAMA_API llama_pos llama_memory_seq_pos_max(
695
+ llama_memory_t mem,
696
+ llama_seq_id seq_id);
697
+
698
+ // Check if the memory supports shifting
699
+ LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
700
+
701
+ //
702
+ // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
613
703
  //
614
704
 
615
705
  // Returns the number of tokens in the KV cache (slow, use only for debug)
@@ -622,86 +712,95 @@ extern "C" {
622
712
  "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
623
713
 
624
714
  // Clear the KV cache - both cell info is erased and KV data is zeroed
625
- LLAMA_API void llama_kv_self_clear(
626
- struct llama_context * ctx);
715
+ DEPRECATED(LLAMA_API void llama_kv_self_clear(
716
+ struct llama_context * ctx),
717
+ "Use llama_memory_clear() instead");
627
718
 
628
719
  // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
629
720
  // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
630
721
  // seq_id < 0 : match any sequence
631
722
  // p0 < 0 : [0, p1]
632
723
  // p1 < 0 : [p0, inf)
633
- LLAMA_API bool llama_kv_self_seq_rm(
724
+ DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
634
725
  struct llama_context * ctx,
635
726
  llama_seq_id seq_id,
636
727
  llama_pos p0,
637
- llama_pos p1);
728
+ llama_pos p1),
729
+ "Use llama_memory_seq_rm() instead");
638
730
 
639
731
  // Copy all tokens that belong to the specified sequence to another sequence
640
732
  // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
641
733
  // p0 < 0 : [0, p1]
642
734
  // p1 < 0 : [p0, inf)
643
- LLAMA_API void llama_kv_self_seq_cp(
735
+ DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
644
736
  struct llama_context * ctx,
645
737
  llama_seq_id seq_id_src,
646
738
  llama_seq_id seq_id_dst,
647
739
  llama_pos p0,
648
- llama_pos p1);
740
+ llama_pos p1),
741
+ "Use llama_memory_seq_cp() instead");
649
742
 
650
743
  // Removes all tokens that do not belong to the specified sequence
651
- LLAMA_API void llama_kv_self_seq_keep(
744
+ DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
652
745
  struct llama_context * ctx,
653
- llama_seq_id seq_id);
746
+ llama_seq_id seq_id),
747
+ "Use llama_memory_seq_keep() instead");
654
748
 
655
749
  // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
656
750
  // If the KV cache is RoPEd, the KV data is updated accordingly:
657
751
  // - lazily on next llama_decode()
658
752
  // p0 < 0 : [0, p1]
659
753
  // p1 < 0 : [p0, inf)
660
- LLAMA_API void llama_kv_self_seq_add(
754
+ DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
661
755
  struct llama_context * ctx,
662
756
  llama_seq_id seq_id,
663
757
  llama_pos p0,
664
758
  llama_pos p1,
665
- llama_pos delta);
759
+ llama_pos delta),
760
+ "Use llama_memory_seq_add() instead");
666
761
 
667
762
  // Integer division of the positions by factor of `d > 1`
668
763
  // If the KV cache is RoPEd, the KV data is updated accordingly:
669
764
  // - lazily on next llama_decode()
670
765
  // p0 < 0 : [0, p1]
671
766
  // p1 < 0 : [p0, inf)
672
- LLAMA_API void llama_kv_self_seq_div(
767
+ DEPRECATED(void llama_kv_self_seq_div(
673
768
  struct llama_context * ctx,
674
769
  llama_seq_id seq_id,
675
770
  llama_pos p0,
676
771
  llama_pos p1,
677
- int d);
772
+ int d),
773
+ "Use llama_memory_seq_div() instead");
678
774
 
679
775
  // Returns the smallest position present in the KV cache for the specified sequence
680
776
  // This is typically non-zero only for SWA caches
681
777
  // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
682
778
  // Return -1 if the sequence is empty
683
- LLAMA_API llama_pos llama_kv_self_seq_pos_min(
779
+ DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
684
780
  struct llama_context * ctx,
685
- llama_seq_id seq_id);
781
+ llama_seq_id seq_id),
782
+ "Use llama_memory_seq_pos_min() instead");
686
783
 
687
784
  // Returns the largest position present in the KV cache for the specified sequence
688
785
  // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
689
786
  // Return -1 if the sequence is empty
690
- LLAMA_API llama_pos llama_kv_self_seq_pos_max(
787
+ DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
691
788
  struct llama_context * ctx,
692
- llama_seq_id seq_id);
789
+ llama_seq_id seq_id),
790
+ "Use llama_memory_seq_pos_max() instead");
693
791
 
694
792
  // Defragment the KV cache
695
793
  // This will be applied:
696
794
  // - lazily on next llama_decode()
697
- LLAMA_API DEPRECATED(void llama_kv_self_defrag(struct llama_context * ctx),
795
+ DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
698
796
  "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
699
797
 
700
798
  // Check if the context supports KV cache shifting
701
- LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
799
+ DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
800
+ "use llama_memory_can_shift() instead");
702
801
 
703
802
  // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
704
- LLAMA_API DEPRECATED(void llama_kv_self_update(struct llama_context * ctx),
803
+ DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
705
804
  "simply remove this call, updates are applied lazily on the next llama_decode()");
706
805
 
707
806
  //
@@ -709,7 +808,7 @@ extern "C" {
709
808
  //
710
809
 
711
810
  // Returns the *actual* size in bytes of the state
712
- // (logits, embedding and kv_cache)
811
+ // (logits, embedding and memory)
713
812
  // Only use when saving the state, not when restoring it, otherwise the size may be too small.
714
813
  LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
715
814
  LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
@@ -765,12 +864,12 @@ extern "C" {
765
864
  size_t n_token_count),
766
865
  "use llama_state_save_file instead");
767
866
 
768
- // Get the exact size needed to copy the KV cache of a single sequence
867
+ // Get the exact size needed to copy the state of a single sequence
769
868
  LLAMA_API size_t llama_state_seq_get_size(
770
869
  struct llama_context * ctx,
771
870
  llama_seq_id seq_id);
772
871
 
773
- // Copy the KV cache of a single sequence into the specified buffer
872
+ // Copy the state of a single sequence into the specified buffer
774
873
  LLAMA_API size_t llama_state_seq_get_data(
775
874
  struct llama_context * ctx,
776
875
  uint8_t * dst,
@@ -836,21 +935,23 @@ extern "C" {
836
935
  // For encode-decoder contexts, processes the batch using the encoder.
837
936
  // Can store the encoder output internally for later use by the decoder's cross-attention layers.
838
937
  // 0 - success
839
- // < 0 - error. the KV cache state is restored to the state before this call
938
+ // < 0 - error. the memory state is restored to the state before this call
840
939
  LLAMA_API int32_t llama_encode(
841
940
  struct llama_context * ctx,
842
941
  struct llama_batch batch);
843
942
 
844
943
  // Process a batch of tokens.
845
- // Requires KV cache.
944
+ // Requires the context to have a memory.
846
945
  // For encode-decoder contexts, processes the batch using the decoder.
847
946
  // Positive return values does not mean a fatal error, but rather a warning.
848
- // Upon non-zero return values, the KV cache state is restored to the state before this call
947
+ // Upon fatal-error or abort, the ubatches that managed to be been processed will remain in the memory state of the context
948
+ // To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
949
+ // Upon other return values, the memory state is restored to the state before this call
849
950
  // 0 - success
850
951
  // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
851
- // 2 - aborted
952
+ // 2 - aborted (processed ubatches will remain in the context's memory)
852
953
  // -1 - invalid input batch
853
- // < -1 - error
954
+ // < -1 - fatal error (processed ubatches will remain in the context's memory)
854
955
  LLAMA_API int32_t llama_decode(
855
956
  struct llama_context * ctx,
856
957
  struct llama_batch batch);
@@ -866,8 +967,8 @@ extern "C" {
866
967
  // Get the number of threads used for prompt and batch processing (multiple token).
867
968
  LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
868
969
 
869
- // Set whether the model is in embeddings mode or not
870
- // If true, embeddings will be returned but logits will not
970
+ // Set whether the context outputs embeddings or not
971
+ // TODO: rename to avoid confusion with llama_get_embeddings()
871
972
  LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
872
973
 
873
974
  // Set whether to use causal attention or not
@@ -916,7 +1017,7 @@ extern "C" {
916
1017
 
917
1018
  // Get the embeddings for a sequence id
918
1019
  // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
919
- // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence
1020
+ // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence
920
1021
  // otherwise: float[n_embd] (1-dimensional)
921
1022
  LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
922
1023
 
@@ -946,6 +1047,7 @@ extern "C" {
946
1047
 
947
1048
  LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
948
1049
  LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
1050
+ LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
949
1051
 
950
1052
  LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
951
1053
  LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
@@ -989,6 +1091,7 @@ extern "C" {
989
1091
  /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
990
1092
  /// @return Returns the number of tokens on success, no more than n_tokens_max
991
1093
  /// @return Returns a negative number on failure - the number of tokens that would have been returned
1094
+ /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
992
1095
  /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
993
1096
  /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
994
1097
  /// as plaintext. Does not insert a leading space.
@@ -0,0 +1,124 @@
1
+ {%- set today = strftime_now("%Y-%m-%d") %}
2
+ {%- set default_system_message = "You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\nYour knowledge base was last updated on 2023-10-01. The current date is " + today + ".\n\nWhen you're not sure about some information or when the user's request requires up-to-date or specific data, you must use the available tools to fetch the information. Do not hesitate to use tools whenever they can provide a more accurate or complete response. If no relevant tools are available, then clearly state that you don't have the information and avoid making up anything.
3
+
4
+ If the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. \"What are some good restaurants around me?\" => \"Where are you?\" or \"When is the next flight to Tokyo\" => \"Where do you travel from?\").
5
+ You are always very attentive to dates, and when asked about information at specific dates, you discard information that is at another date.
6
+ You follow these instructions in all languages, and always respond to the user in the language they use or request.
7
+ Next sections describe the capabilities that you have.
8
+
9
+ # WEB BROWSING INSTRUCTIONS
10
+
11
+ You cannot perform any web search or access internet to open URLs, links etc. If it seems like the user is expecting you to do so, you clarify the situation and ask the user to copy paste the text directly in the chat.
12
+
13
+ # MULTI-MODAL INSTRUCTIONS
14
+
15
+ You have the ability to read images, but you cannot generate images. You also cannot transcribe audio files or videos.
16
+ You cannot read nor transcribe audio files or videos.
17
+
18
+ # TOOL CALLING INSTRUCTIONS
19
+
20
+ You may have access to tools that you can use to fetch information or perform actions. You must use these tools in the following situations:
21
+
22
+ 1. When the request requires up-to-date information.
23
+ 2. When the request requires specific data that you do not have in your knowledge base.
24
+ 3. When the request involves actions that you cannot perform without tools.
25
+
26
+ Always prioritize using tools to provide the most accurate and helpful response. If tools are not available, inform the user that you cannot perform the requested action at the moment." %}
27
+
28
+ {{- bos_token }}
29
+
30
+ {%- set system_prompt = default_system_message %}
31
+ {%- set loop_messages = messages %}
32
+
33
+ {%- if not tools is defined %}
34
+ {%- set tools = none %}
35
+ {%- endif %}
36
+
37
+ {%- if messages|length > 0 and messages[0]['role'] == 'system' %}
38
+ {%- if messages[0]['content'] is string %}
39
+ {%- set system_prompt = messages[0]['content'] %}
40
+ {%- else %}
41
+ {%- set system_prompt = messages[0]['content'][0]['text'] %}
42
+ {%- endif %}
43
+ {%- set loop_messages = messages[1:] %}
44
+ {%- endif %}
45
+
46
+ {%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %}
47
+
48
+ {%- set ns = namespace(index=0) %}
49
+ {%- for message in loop_messages %}
50
+ {%- if not (message.role == "tool" or (message.get('tool_calls'))) %}
51
+ {%- if (message["role"] == "user") != (ns.index % 2 == 0) %}
52
+ {{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }}
53
+ {%- endif %}
54
+ {%- set ns.index = ns.index + 1 %}
55
+ {%- endif %}
56
+ {%- endfor %}
57
+
58
+ {{- '[SYSTEM_PROMPT]' + system_prompt + '[/SYSTEM_PROMPT]' }}
59
+
60
+ {%- for message in loop_messages %}
61
+ {%- if message['role'] == 'system' %}
62
+ {%- if message['content'] is string %}
63
+ {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }}
64
+ {%- else %}
65
+ {{- '[SYSTEM_PROMPT]' + message['content'][0]['text'] + '[/SYSTEM_PROMPT]' }}
66
+ {%- endif %}
67
+ {%- elif message['role'] == 'user' %}
68
+ {%- if tools is not none and (message == user_messages[-1]) %}
69
+ {{- '[AVAILABLE_TOOLS]' + tools|tojson + '[/AVAILABLE_TOOLS]' }}
70
+ {%- endif %}
71
+ {{- '[INST]' }}
72
+ {%- if message['content'] is string %}
73
+ {{- message['content'] }}
74
+ {%- else %}
75
+ {%- for block in message['content'] %}
76
+ {%- if block['type'] == 'text' %}
77
+ {{- block['text'] }}
78
+ {%- elif block['type'] in ['image', 'image_url'] %}
79
+ {{- '[IMG]' }}
80
+ {%- else %}
81
+ {{- raise_exception('Only text and image blocks are supported in message content!') }}
82
+ {%- endif %}
83
+ {%- endfor %}
84
+ {%- endif %}
85
+ {{- '[/INST]' }}
86
+ {%- elif message['role'] == 'assistant' %}
87
+ {%- if message.get('tool_calls') %}
88
+ {%- for tool_call in message.tool_calls %}
89
+ {{- '[TOOL_CALLS]' + tool_call.function.name }}
90
+ {%- if not tool_call.id is defined or tool_call.id is not string or tool_call.id|length != 9 %}
91
+ {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }}
92
+ {%- endif %}
93
+ {{- '[CALL_ID]' + tool_call.id }}
94
+ {{- '[ARGS]' + tool_call['function']['arguments']|tojson }}
95
+ {%- endfor %}
96
+ {{- eos_token }}
97
+ {%- elif message['content'] is string %}
98
+ {{- message['content'] + eos_token }}
99
+ {%- else %}
100
+ {%- for block in message['content'] %}
101
+ {%- if block['type'] == 'text' %}
102
+ {{- block['text'] }}
103
+ {%- elif block['type'] in ['image', 'image_url'] %}
104
+ {{- '[IMG]' }}
105
+ {%- else %}
106
+ {{- raise_exception('Only text and image blocks are supported in assistant content!') }}
107
+ {%- endif %}
108
+ {%- endfor %}
109
+ {{- eos_token }}
110
+ {%- endif %}
111
+ {%- elif message['role'] == 'tool_results' or message['role'] == 'tool' %}
112
+ {%- if message.content is defined and message.content.content is defined %}
113
+ {%- set content = message.content.content %}
114
+ {%- else %}
115
+ {%- set content = message.content %}
116
+ {%- endif %}
117
+ {%- if not message.tool_call_id is defined or message.tool_call_id is not string or message['tool_call_id']|length != 9 %}
118
+ {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }}
119
+ {%- endif %}
120
+ {{- '[TOOL_RESULTS]' + message.tool_call_id + '[TOOL_CONTENT]' + content|string + '[/TOOL_RESULTS]' }}
121
+ {%- else %}
122
+ {{- raise_exception('Only system, user, assistant, and tool roles are supported!') }}
123
+ {%- endif %}
124
+ {%- endfor %}
@@ -1,2 +1,3 @@
1
1
  tabulate~=0.9.0
2
2
  GitPython~=3.1.43
3
+ matplotlib~=3.10.0
@@ -20,11 +20,11 @@ add_library(llama
20
20
  llama-hparams.cpp
21
21
  llama-impl.cpp
22
22
  llama-io.cpp
23
- llama-kv-cache.cpp
24
23
  llama-kv-cache-unified.cpp
25
24
  llama-kv-cache-unified-iswa.cpp
26
- llama-kv-cache-recurrent.cpp
27
25
  llama-memory.cpp
26
+ llama-memory-hybrid.cpp
27
+ llama-memory-recurrent.cpp
28
28
  llama-mmap.cpp
29
29
  llama-model-loader.cpp
30
30
  llama-model-saver.cpp