@novastera-oss/llamarn 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266) hide show
  1. package/README.md +80 -14
  2. package/RNLlamaCpp.podspec +10 -3
  3. package/android/CMakeLists.txt +8 -0
  4. package/android/src/main/cpp/include/llama.h +62 -125
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  13. package/cpp/build-info.cpp +2 -2
  14. package/cpp/llama.cpp/README.md +11 -3
  15. package/cpp/llama.cpp/build-xcframework.sh +1 -0
  16. package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
  17. package/cpp/llama.cpp/common/arg.cpp +153 -113
  18. package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
  19. package/cpp/llama.cpp/common/chat-parser.h +117 -0
  20. package/cpp/llama.cpp/common/chat.cpp +847 -699
  21. package/cpp/llama.cpp/common/chat.h +73 -6
  22. package/cpp/llama.cpp/common/common.cpp +50 -82
  23. package/cpp/llama.cpp/common/common.h +21 -17
  24. package/cpp/llama.cpp/common/json-partial.cpp +255 -0
  25. package/cpp/llama.cpp/common/json-partial.h +37 -0
  26. package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
  27. package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
  28. package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
  29. package/cpp/llama.cpp/common/regex-partial.h +56 -0
  30. package/cpp/llama.cpp/common/sampling.cpp +7 -8
  31. package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
  32. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
  33. package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
  34. package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
  35. package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
  36. package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
  37. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
  38. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
  39. package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
  41. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
  42. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
  71. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
  73. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
  74. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  75. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  94. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
  96. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  98. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
  99. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
  100. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
  102. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
  103. package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
  105. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
  110. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
  111. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
  112. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  114. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
  115. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  117. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
  118. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
  119. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
  120. package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
  121. package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
  122. package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
  123. package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
  124. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
  125. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
  126. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
  127. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
  128. package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
  129. package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
  130. package/cpp/llama.cpp/include/llama.h +62 -125
  131. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
  132. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
  133. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
  134. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
  135. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
  136. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
  137. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
  138. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
  139. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
  140. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
  141. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
  142. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
  143. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
  144. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
  145. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
  146. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
  147. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
  148. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
  149. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  150. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
  151. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
  152. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
  153. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
  154. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
  155. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
  156. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
  157. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
  158. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
  159. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
  160. package/cpp/llama.cpp/models/templates/README.md +2 -0
  161. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  162. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  163. package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  164. package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
  165. package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
  166. package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
  167. package/cpp/llama.cpp/src/llama-arch.h +2 -0
  168. package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
  169. package/cpp/llama.cpp/src/llama-context.cpp +340 -123
  170. package/cpp/llama.cpp/src/llama-context.h +30 -0
  171. package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
  172. package/cpp/llama.cpp/src/llama-cparams.h +2 -0
  173. package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
  174. package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
  175. package/cpp/llama.cpp/src/llama-graph.h +52 -7
  176. package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
  177. package/cpp/llama.cpp/src/llama-hparams.h +37 -5
  178. package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
  179. package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
  180. package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
  181. package/cpp/llama.cpp/src/llama-memory.h +4 -3
  182. package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
  183. package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
  184. package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
  185. package/cpp/llama.cpp/src/llama-model.cpp +529 -172
  186. package/cpp/llama.cpp/src/llama-model.h +6 -1
  187. package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
  188. package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
  189. package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
  190. package/cpp/llama.cpp/src/llama-vocab.h +6 -0
  191. package/cpp/llama.cpp/src/llama.cpp +14 -0
  192. package/cpp/rn-completion.cpp +4 -2
  193. package/ios/include/chat.h +73 -6
  194. package/ios/include/common/minja/chat-template.hpp +9 -5
  195. package/ios/include/common/minja/minja.hpp +69 -36
  196. package/ios/include/common.h +21 -17
  197. package/ios/include/llama.h +62 -125
  198. package/ios/libs/llama.xcframework/Info.plist +19 -19
  199. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  200. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
  201. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  202. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
  203. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
  204. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  205. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  206. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  207. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
  208. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  209. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  210. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  211. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  212. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  213. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
  219. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
  220. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
  221. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  222. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
  223. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
  224. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
  225. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  226. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  227. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  228. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
  229. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
  231. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
  232. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  233. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  234. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
  235. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
  236. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  237. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  238. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  239. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  240. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  241. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
  242. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  243. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
  244. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
  245. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  246. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  247. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
  248. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
  249. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  250. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  251. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  252. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  253. package/package.json +1 -1
  254. package/cpp/llama.cpp/common/stb_image.h +0 -7988
  255. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
  256. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
  257. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
  258. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
  259. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
  260. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
  261. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
  262. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
  263. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
  264. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
  265. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
  266. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
@@ -0,0 +1,379 @@
1
+ #pragma once
2
+
3
+ #include "llama.h"
4
+ #include "llama-cparams.h"
5
+
6
+ #include <bitset>
7
+ #include <cassert>
8
+ #include <vector>
9
+ #include <set>
10
+
11
+ // meta information about KV cells that can be part of multiple sequences at the same time
12
+ // TODO: add unit tests
13
+ class llama_kv_cells_unified {
14
+ public:
15
+ void reset() {
16
+ for (uint32_t i = 0; i < pos.size(); ++i) {
17
+ pos[i] = -1;
18
+ shift[i] = 0;
19
+ seq[i].reset();
20
+ }
21
+
22
+ has_shift = false;
23
+
24
+ used.clear();
25
+
26
+ for (uint32_t s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
27
+ seq_pos[s].clear();
28
+ }
29
+ }
30
+
31
+ void reset_shift() {
32
+ has_shift = false;
33
+
34
+ for (uint32_t i = 0; i < shift.size(); ++i) {
35
+ shift[i] = 0;
36
+ }
37
+ }
38
+
39
+ uint32_t size() const {
40
+ return pos.size();
41
+ }
42
+
43
+ void resize(uint32_t n) {
44
+ pos.resize(n);
45
+ shift.resize(n);
46
+ seq.resize(n);
47
+
48
+ reset();
49
+ }
50
+
51
+ bool is_empty(uint32_t i) const {
52
+ assert(i < pos.size());
53
+ assert((pos[i] < 0 && pos[i] == -1) || pos[i] >= 0);
54
+
55
+ return pos[i] == -1;
56
+ }
57
+
58
+ uint32_t get_used() const {
59
+ return used.size();
60
+ }
61
+
62
+ // the index of the first cell that is used
63
+ // return 0 if no cells are used
64
+ uint32_t used_min() const {
65
+ return used.empty() ? 0 : *used.begin();
66
+ }
67
+
68
+ // the index of the last cell that is used + 1
69
+ // return 0 if no cells are used
70
+ uint32_t used_max_p1() const {
71
+ #if 0
72
+ if (!seq_pos[0].empty()) printf("kv_cells: min[0] = %5d, max[0] = %5d\n", *seq_pos[0].begin(), *seq_pos[0].rbegin());
73
+ if (!seq_pos[1].empty()) printf("kv_cells: min[1] = %5d, max[1] = %5d\n", *seq_pos[1].begin(), *seq_pos[1].rbegin());
74
+ if (!seq_pos[2].empty()) printf("kv_cells: min[2] = %5d, max[2] = %5d\n", *seq_pos[2].begin(), *seq_pos[2].rbegin());
75
+ #endif
76
+
77
+ return used.empty() ? 0 : *used.rbegin() + 1;
78
+ }
79
+
80
+ bool get_has_shift() const {
81
+ return has_shift;
82
+ }
83
+
84
+ // move cell isrc to idst (used during defrag)
85
+ void mv(uint32_t isrc, uint32_t idst) {
86
+ assert(isrc < pos.size());
87
+ assert(idst < pos.size());
88
+
89
+ pos [idst] = pos [isrc];
90
+ shift[idst] = shift[isrc];
91
+ seq [idst] = seq [isrc];
92
+
93
+ pos [isrc] = -1;
94
+ shift[isrc] = 0;
95
+ seq [isrc].reset();
96
+
97
+ used.erase (isrc);
98
+ used.insert(idst);
99
+ }
100
+
101
+ // copy the state of cells [i, i + n) (used for save/restore the state of the cells)
102
+ llama_kv_cells_unified cp(uint32_t i, uint32_t n) const {
103
+ assert(i + n <= pos.size());
104
+
105
+ llama_kv_cells_unified res;
106
+
107
+ res.resize(n);
108
+
109
+ for (uint32_t j = 0; j < n; ++j) {
110
+ res.pos[j] = pos[i + j];
111
+ res.seq[j] = seq[i + j];
112
+
113
+ assert(shift[i + j] == 0);
114
+ }
115
+
116
+ return res;
117
+ }
118
+
119
+ // set the state of cells [i, i + other.pos.size()) (used for save/restore the state of the cells)
120
+ void set(uint32_t i, const llama_kv_cells_unified & other) {
121
+ assert(i + other.pos.size() <= pos.size());
122
+
123
+ for (uint32_t j = 0; j < other.pos.size(); ++j) {
124
+ if (pos[i + j] == -1 && other.pos[j] != -1) {
125
+ used.insert(i + j);
126
+ }
127
+
128
+ if (pos[i + j] != -1 && other.pos[j] == -1) {
129
+ used.erase(i + j);
130
+ }
131
+
132
+ if (pos[i + j] != -1) {
133
+ seq_pos_rm(i + j);
134
+ }
135
+
136
+ pos[i + j] = other.pos[j];
137
+ seq[i + j] = other.seq[j];
138
+
139
+ if (pos[i + j] != -1) {
140
+ seq_pos_add(i + j);
141
+ }
142
+
143
+ assert(shift[i + j] == 0);
144
+ }
145
+ }
146
+
147
+ // note: call only if the cell has seq_id
148
+ // return true if the cell becomes empty
149
+ bool seq_rm(uint32_t i, llama_seq_id seq_id) {
150
+ assert(i < pos.size());
151
+ assert(seq[i].test(seq_id));
152
+ assert(pos[i] != -1);
153
+ assert(seq_id >= 0);
154
+
155
+ seq[i].reset(seq_id);
156
+ seq_pos[seq_id].erase(pos[i]);
157
+
158
+ if (seq[i].none()) {
159
+ pos[i] = -1;
160
+
161
+ used.erase(i);
162
+
163
+ return true;
164
+ }
165
+
166
+ return false;
167
+ }
168
+
169
+ // return true if the cell becomes empty (i.e. it did not contain seq_id before the call)
170
+ bool seq_keep(uint32_t i, llama_seq_id seq_id) {
171
+ assert(i < pos.size());
172
+
173
+ if (seq[i].test(seq_id)) {
174
+ seq_pos_rm(i);
175
+ seq[i].reset();
176
+
177
+ seq[i].set(seq_id);
178
+ seq_pos[seq_id].insert(pos[i]);
179
+
180
+ return false;
181
+ }
182
+
183
+ if (seq[i].any()) {
184
+ seq_pos_rm(i);
185
+ seq[i].reset();
186
+
187
+ pos[i] = -1;
188
+
189
+ used.erase(i);
190
+
191
+ return true;
192
+ }
193
+
194
+ assert(pos[i] == -1);
195
+
196
+ return false;
197
+ }
198
+
199
+ bool seq_has(uint32_t i, llama_seq_id seq_id) const {
200
+ assert(i < pos.size());
201
+ assert(seq_id >= 0);
202
+
203
+ return seq[i].test(seq_id);
204
+ }
205
+
206
+ // note: call only if the cell is not empty and the seq_id is not in the cell
207
+ void seq_add(uint32_t i, llama_seq_id seq_id) {
208
+ assert(i < pos.size());
209
+ assert(pos[i] != -1);
210
+ assert(!seq[i].test(seq_id));
211
+
212
+ seq[i].set(seq_id);
213
+ seq_pos[seq_id].insert(pos[i]);
214
+ }
215
+
216
+ // the minimum position of sequence seq_id currently present in any of the cells
217
+ // return -1 if the sequence is not present
218
+ llama_pos seq_pos_min(llama_seq_id seq_id) const {
219
+ assert(seq_id >= 0);
220
+ assert(seq_id < LLAMA_MAX_PARALLEL_SEQUENCES);
221
+
222
+ if (seq_pos[seq_id].empty()) {
223
+ return -1;
224
+ }
225
+
226
+ return *seq_pos[seq_id].begin();
227
+ }
228
+
229
+ // the maximum position of sequence seq_id currently present in any of the cells
230
+ // return -1 if the sequence is not present
231
+ llama_pos seq_pos_max(llama_seq_id seq_id) const {
232
+ assert(seq_id >= 0);
233
+ assert(seq_id < LLAMA_MAX_PARALLEL_SEQUENCES);
234
+
235
+ if (seq_pos[seq_id].empty()) {
236
+ return -1;
237
+ }
238
+
239
+ return *seq_pos[seq_id].rbegin();
240
+ }
241
+
242
+ // note: call only if the cell is not empty
243
+ llama_pos pos_get(uint32_t i) const {
244
+ assert(i < pos.size());
245
+ assert(pos[i] != -1);
246
+
247
+ return pos[i];
248
+ }
249
+
250
+ // note: call only if the cell is not empty
251
+ llama_pos get_shift(uint32_t i) const {
252
+ assert(i < pos.size());
253
+ assert(pos[i] != -1);
254
+
255
+ return shift[i];
256
+ }
257
+
258
+ // check if a cell is not empty and its position is within [p0, p1)
259
+ bool pos_in(uint32_t i, llama_pos p0, llama_pos p1) const {
260
+ assert(i < pos.size());
261
+
262
+ return pos[i] >= p0 && pos[i] < p1;
263
+ }
264
+
265
+ // set the position of an empty cell
266
+ // does not modify "has_shift"
267
+ // note: call only if the cell is empty
268
+ void pos_set(uint32_t i, llama_pos p) {
269
+ assert(i < pos.size());
270
+ assert(pos[i] == -1);
271
+
272
+ pos[i] = p;
273
+
274
+ used.insert(i);
275
+ }
276
+
277
+ // pos[i] = pos[i] + d
278
+ // sets "has_shift" to true
279
+ // note: call only if the cell is not empty
280
+ bool pos_add(uint32_t i, llama_pos d) {
281
+ assert(i < pos.size());
282
+ assert(pos[i] != -1);
283
+
284
+ seq_pos_rm(i);
285
+
286
+ pos[i] += d;
287
+ shift[i] += d;
288
+
289
+ seq_pos_add(i);
290
+
291
+ has_shift = true;
292
+
293
+ if (pos[i] < 0) {
294
+ seq_pos_rm(i);
295
+
296
+ seq[i].reset();
297
+ pos[i] = -1;
298
+
299
+ used.erase(i);
300
+
301
+ return true;
302
+ }
303
+
304
+ return false;
305
+ }
306
+
307
+ // pos[i] = pos[i] / d
308
+ // sets "has_shift" to true
309
+ // note: call only if the cell is not empty
310
+ void pos_div(uint32_t i, int d) {
311
+ assert(i < pos.size());
312
+ assert(pos[i] != -1);
313
+
314
+ const llama_pos p_old = pos[i];
315
+
316
+ seq_pos_rm(i);
317
+
318
+ pos[i] /= d;
319
+ shift[i] += p_old - pos[i];
320
+
321
+ seq_pos_add(i);
322
+
323
+ has_shift = true;
324
+ }
325
+
326
+ private:
327
+ bool has_shift = false;
328
+
329
+ // set of indices of used cells (i.e. pos[i] != -1, allowed to not have any seq_id)
330
+ std::set<uint32_t> used;
331
+
332
+ std::vector<llama_pos> pos;
333
+
334
+ // this array accumulates any applied shifts to the pos array since the last reset_shift() call
335
+ // this is used to queue multiple updates to the pos array, which in the end can be applied in one go:
336
+ //
337
+ // cells.pos_add(x, shift_x);
338
+ // cells.pos_div(y, shift_y);
339
+ // ...
340
+ //
341
+ // if (cells.has_shift()) {
342
+ // for (int i = 0; i < n; ++i) {
343
+ // auto shift_i = cells.get_shift(i);
344
+ // ...
345
+ // }
346
+ // cells.reset_shift();
347
+ // }
348
+ //
349
+ std::vector<llama_pos> shift;
350
+
351
+ using bits_t = std::bitset<LLAMA_MAX_PARALLEL_SEQUENCES>;
352
+
353
+ // the bitset seq[i] tells us which sequences are currently occupying the i-th cell
354
+ std::vector<bits_t> seq;
355
+
356
+ // the set seq_pos[s] tells us which positions are currently present for sequence s
357
+ // this way seq_pos[s].begin() and seq_pos[s].rbegin() give us the min/max positions currently in the cache
358
+ std::set<llama_pos> seq_pos[LLAMA_MAX_PARALLEL_SEQUENCES];
359
+
360
+ // helper functions for updating `seq_pos`, once cell at a time:
361
+
362
+ // remove cell i
363
+ void seq_pos_rm(uint32_t i) {
364
+ for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
365
+ if (seq[i].test(s)) {
366
+ seq_pos[s].erase(pos[i]);
367
+ }
368
+ }
369
+ }
370
+
371
+ // add cell i
372
+ void seq_pos_add(uint32_t i) {
373
+ for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
374
+ if (seq[i].test(s)) {
375
+ seq_pos[s].insert(pos[i]);
376
+ }
377
+ }
378
+ }
379
+ };
@@ -7,8 +7,8 @@ struct llama_memory_params {
7
7
  ggml_type type_k;
8
8
  ggml_type type_v;
9
9
 
10
- // parameters for other types of memory
11
- // ...
10
+ // use full-size SWA cache
11
+ bool swa_full;
12
12
  };
13
13
 
14
14
  // general concept of LLM memory
@@ -22,9 +22,10 @@ public:
22
22
  virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0;
23
23
  virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
24
24
  virtual void seq_keep(llama_seq_id seq_id) = 0;
25
- virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) = 0;
25
+ virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) = 0;
26
26
  virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0;
27
27
 
28
+ virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
28
29
  virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;
29
30
 
30
31
  virtual bool get_can_edit() const = 0;
@@ -301,12 +301,12 @@ namespace GGUFMeta {
301
301
  GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
302
302
 
303
303
  switch (arr_info.gt) {
304
- case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
305
- case GGUF_TYPE_INT32: GGML_ASSERT(
306
- (std::is_same<T, int32_t>::value) ||
307
- (std::is_same<T, uint32_t>::value)); break;
304
+ case GGUF_TYPE_UINT32:
305
+ case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) ||
306
+ (std::is_same<T, uint32_t>::value)); break;
307
+ case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
308
308
  default:
309
- throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
309
+ throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
310
310
  }
311
311
 
312
312
  result.resize(arr_info.length);
@@ -330,12 +330,12 @@ namespace GGUFMeta {
330
330
  GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
331
331
 
332
332
  switch (arr_info.gt) {
333
- case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
334
- case GGUF_TYPE_INT32: GGML_ASSERT(
335
- (std::is_same<T, int32_t>::value) ||
336
- (std::is_same<T, uint32_t>::value)); break;
333
+ case GGUF_TYPE_UINT32:
334
+ case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) ||
335
+ (std::is_same<T, uint32_t>::value)); break;
336
+ case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
337
337
  default:
338
- throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
338
+ throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
339
339
  }
340
340
 
341
341
  if (arr_info.length > N_MAX) {
@@ -469,7 +469,7 @@ llama_model_loader::llama_model_loader(
469
469
 
470
470
  meta.reset(gguf_init_from_file(fname.c_str(), params));
471
471
  if (!meta) {
472
- throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
472
+ throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
473
473
  }
474
474
 
475
475
  get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
@@ -528,7 +528,7 @@ llama_model_loader::llama_model_loader(
528
528
  };
529
529
  gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
530
530
  if (!ctx_gguf) {
531
- throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname_split));
531
+ throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
532
532
  }
533
533
 
534
534
  // check idx
@@ -822,13 +822,18 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
822
822
  mappings.reserve(files.size());
823
823
  mmaps_used.reserve(files.size());
824
824
  for (const auto & file : files) {
825
- auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
826
- if (!reg) {
827
- throw std::runtime_error(format("%s: no CPU backend found", __func__));
825
+ bool is_numa = false;
826
+
827
+ auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
828
+ if (dev) {
829
+ auto * reg = ggml_backend_dev_backend_reg(dev);
830
+ auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
831
+ if (is_numa_fn) {
832
+ is_numa = is_numa_fn();
833
+ }
828
834
  }
829
835
 
830
- auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
831
- std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
836
+ std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa);
832
837
  mmaps_used.emplace_back(mapping->size(), 0);
833
838
  if (mlock_mmaps) {
834
839
  std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());