@novastera-oss/llamarn 0.2.9 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (314)
  1. package/android/build.gradle +2 -1
  2. package/android/proguard-rules.pro +12 -0
  3. package/android/src/main/cpp/include/llama.h +15 -47
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  13. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  15. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  20. package/cpp/build-info.cpp +2 -2
  21. package/cpp/llama.cpp/CMakeLists.txt +0 -1
  22. package/cpp/llama.cpp/CMakePresets.json +11 -0
  23. package/cpp/llama.cpp/CODEOWNERS +1 -0
  24. package/cpp/llama.cpp/README.md +8 -8
  25. package/cpp/llama.cpp/build-xcframework.sh +1 -1
  26. package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
  27. package/cpp/llama.cpp/common/arg.cpp +62 -1
  28. package/cpp/llama.cpp/common/chat.cpp +37 -20
  29. package/cpp/llama.cpp/common/chat.h +2 -0
  30. package/cpp/llama.cpp/common/common.cpp +22 -6
  31. package/cpp/llama.cpp/common/common.h +22 -4
  32. package/cpp/llama.cpp/convert_hf_to_gguf.py +1250 -43
  33. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +21 -13
  34. package/cpp/llama.cpp/ggml/CMakeLists.txt +13 -3
  35. package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +85 -47
  36. package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
  37. package/cpp/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  38. package/cpp/llama.cpp/ggml/include/ggml.h +173 -10
  39. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  40. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +0 -15
  41. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +7 -8
  42. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +44 -38
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  44. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +126 -8
  45. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +130 -22
  46. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +138 -18
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +11 -3
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +1 -1
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +28 -1
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +109 -12
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +3 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +88 -10
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1206 -163
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +6 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +0 -1
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +36 -9
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +142 -9
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +3 -3
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +31 -4
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +86 -17
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +225 -0
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +41 -301
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +85 -64
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +47 -60
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +29 -42
  70. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +46 -59
  71. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +36 -45
  72. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +38 -45
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +23 -36
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +3 -13
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +255 -99
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +1 -1
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +111 -3
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +6 -4
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1152 -695
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +92 -5
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +2 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +275 -0
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
  88. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  89. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
  90. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +104 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +13 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
  93. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +27 -6
  94. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -2
  95. package/cpp/llama.cpp/ggml/src/ggml-impl.h +80 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
  97. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +48 -12
  98. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +572 -106
  99. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +599 -105
  100. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +18 -4
  101. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +5 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +800 -42
  103. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  105. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  106. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  108. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  109. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  110. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  111. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  112. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
  114. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
  115. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
  116. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
  117. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  118. package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
  119. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +4 -4
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +693 -1034
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +14 -26
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +191 -55
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +8 -9
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +15 -18
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
  131. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +991 -307
  132. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +265 -0
  133. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +59 -12
  134. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  135. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  136. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  137. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  138. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  139. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
  140. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
  141. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
  142. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
  143. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
  144. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  145. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  146. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  147. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  148. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -0
  149. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +17 -0
  150. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  151. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +3 -8
  152. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
  153. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
  154. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +18 -3
  156. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  158. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  159. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  160. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  161. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  162. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  163. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
  164. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  166. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +84 -9
  167. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  168. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +907 -0
  169. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  170. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +35 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  172. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +56 -0
  173. package/cpp/llama.cpp/ggml/src/ggml.c +386 -67
  174. package/cpp/llama.cpp/ggml/src/gguf.cpp +8 -1
  175. package/cpp/llama.cpp/gguf-py/gguf/constants.py +307 -0
  176. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +8 -2
  177. package/cpp/llama.cpp/gguf-py/gguf/metadata.py +4 -0
  178. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_dump.py +24 -1
  179. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +122 -47
  180. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +12 -3
  181. package/cpp/llama.cpp/include/llama.h +15 -47
  182. package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +34 -0
  183. package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +43 -0
  184. package/cpp/llama.cpp/requirements/requirements-all.txt +1 -0
  185. package/cpp/llama.cpp/requirements/requirements-server-bench.txt +5 -0
  186. package/cpp/llama.cpp/src/llama-arch.cpp +316 -3
  187. package/cpp/llama.cpp/src/llama-arch.h +23 -1
  188. package/cpp/llama.cpp/src/llama-batch.cpp +103 -71
  189. package/cpp/llama.cpp/src/llama-batch.h +31 -18
  190. package/cpp/llama.cpp/src/llama-chat.cpp +58 -1
  191. package/cpp/llama.cpp/src/llama-chat.h +3 -0
  192. package/cpp/llama.cpp/src/llama-context.cpp +180 -106
  193. package/cpp/llama.cpp/src/llama-context.h +26 -16
  194. package/cpp/llama.cpp/src/llama-cparams.h +3 -2
  195. package/cpp/llama.cpp/src/llama-graph.cpp +310 -211
  196. package/cpp/llama.cpp/src/llama-graph.h +184 -122
  197. package/cpp/llama.cpp/src/llama-hparams.cpp +47 -1
  198. package/cpp/llama.cpp/src/llama-hparams.h +13 -2
  199. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +38 -22
  200. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +7 -2
  201. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +849 -304
  202. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +143 -47
  203. package/cpp/llama.cpp/src/llama-kv-cells.h +62 -10
  204. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +10 -4
  205. package/cpp/llama.cpp/src/llama-memory-hybrid.h +3 -1
  206. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +36 -11
  207. package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
  208. package/cpp/llama.cpp/src/llama-memory.h +3 -0
  209. package/cpp/llama.cpp/src/llama-model.cpp +3545 -719
  210. package/cpp/llama.cpp/src/llama-model.h +21 -4
  211. package/cpp/llama.cpp/src/llama-quant.cpp +2 -2
  212. package/cpp/llama.cpp/src/llama-vocab.cpp +376 -10
  213. package/cpp/llama.cpp/src/llama-vocab.h +43 -0
  214. package/cpp/llama.cpp/src/unicode.cpp +207 -0
  215. package/cpp/llama.cpp/src/unicode.h +2 -0
  216. package/ios/include/chat.h +2 -0
  217. package/ios/include/common.h +22 -4
  218. package/ios/include/llama.h +15 -47
  219. package/ios/libs/llama.xcframework/Info.plist +13 -13
  220. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  221. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -4890
  222. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  223. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +173 -10
  224. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +15 -47
  225. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  226. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  227. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
  228. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3764
  229. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  230. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  231. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
  232. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  233. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  234. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
  235. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4016 -3766
  236. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
  237. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +173 -10
  238. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +15 -47
  239. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
  240. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +173 -10
  241. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +15 -47
  242. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  243. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
  244. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +173 -10
  245. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +15 -47
  246. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  247. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  248. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  249. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -4890
  250. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  251. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +173 -10
  252. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +15 -47
  253. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  254. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  255. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
  256. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3764
  257. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  258. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  259. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
  260. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  261. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  262. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5303 -4926
  263. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  264. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +173 -10
  265. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +15 -47
  266. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  267. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  268. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5274 -4897
  269. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4044 -3794
  270. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  271. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  272. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
  273. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  274. package/package.json +4 -4
  275. package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
  276. package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  277. package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  278. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  279. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  280. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  281. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  282. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  283. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  284. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  285. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  286. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  287. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  288. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  289. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  290. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  291. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  292. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  293. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  294. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  295. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  296. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  297. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  298. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  299. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  300. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  301. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  302. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  303. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  304. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  305. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  306. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  307. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  308. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  309. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  310. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  311. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  312. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  313. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  314. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
@@ -98,10 +98,20 @@ llama_context::llama_context(
98
98
  LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
99
99
  cparams.n_batch = GGML_KQ_MASK_PAD;
100
100
  }
101
-
102
101
  cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
103
102
 
104
103
  cparams.op_offload = params.op_offload;
104
+ cparams.kv_unified = params.kv_unified;
105
+
106
+ {
107
+ const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
108
+ supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
109
+
110
+ if (!supports_set_rows && !cparams.kv_unified) {
111
+ LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
112
+ cparams.kv_unified = true;
113
+ }
114
+ }
105
115
 
106
116
  const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
107
117
 
@@ -112,6 +122,7 @@ llama_context::llama_context(
112
122
  LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
113
123
  LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn);
114
124
  LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
125
+ LLAMA_LOG_INFO("%s: kv_unified = %s\n", __func__, cparams.kv_unified ? "true" : "false");
115
126
  LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
116
127
  LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
117
128
 
@@ -227,8 +238,8 @@ llama_context::llama_context(
227
238
 
228
239
  LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
229
240
 
230
- // buffer used to store the computation graph and the tensor meta data
231
- buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
241
+ gf_res_prev.reset(new llm_graph_result(max_nodes));
242
+ gf_res_reserve.reset(new llm_graph_result(max_nodes));
232
243
 
233
244
  // TODO: move these checks to ggml_backend_sched
234
245
  // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
@@ -267,7 +278,7 @@ llama_context::llama_context(
267
278
 
268
279
  // reserve worst-case graph
269
280
  if (!hparams.vocab_only && memory) {
270
- const uint32_t n_seqs = cparams.n_seq_max;
281
+ const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
271
282
  const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
272
283
 
273
284
  LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
@@ -300,7 +311,7 @@ llama_context::llama_context(
300
311
 
301
312
  // reserve with tg graph to get the number of splits and nodes
302
313
  {
303
- auto * gf = graph_reserve(1, 1, 1, mctx.get());
314
+ auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get());
304
315
  if (!gf) {
305
316
  throw std::runtime_error("failed to allocate compute tg buffers");
306
317
  }
@@ -311,6 +322,10 @@ llama_context::llama_context(
311
322
 
312
323
  // reserve again with pp graph to avoid ggml-alloc reallocations during inference
313
324
  {
325
+ // TODO: not sure if the following graph would be a worse case for multi-stream KV caches:
326
+ //
327
+ // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
328
+ //
314
329
  auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
315
330
  if (!gf) {
316
331
  throw std::runtime_error("failed to allocate compute pp buffers");
@@ -388,10 +403,6 @@ ggml_backend_sched_t llama_context::get_sched() const {
388
403
  return sched.get();
389
404
  }
390
405
 
391
- ggml_context * llama_context::get_ctx_compute() const {
392
- return ctx_compute.get();
393
- }
394
-
395
406
  uint32_t llama_context::n_ctx() const {
396
407
  return cparams.n_ctx;
397
408
  }
@@ -463,6 +474,11 @@ bool llama_context::kv_self_update(bool optimize) {
463
474
  }
464
475
  }
465
476
 
477
+ // reset the previous graph result to make sure that it won't be reused
478
+ // TODO: change the mctx->apply() to return information if a graph reserve is needed
479
+ // reset the graph result only if the memory module did reset the scheduler
480
+ gf_res_prev->reset();
481
+
466
482
  if (!mctx->apply()) {
467
483
  LLAMA_LOG_ERROR("%s: failed to apply memory update\n", __func__);
468
484
  }
@@ -475,7 +491,7 @@ bool llama_context::kv_self_update(bool optimize) {
475
491
  throw std::runtime_error("failed to initialize memory context");
476
492
  }
477
493
 
478
- const uint32_t n_seqs = cparams.n_seq_max;
494
+ const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
479
495
  const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
480
496
 
481
497
  auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
@@ -492,12 +508,16 @@ enum llama_pooling_type llama_context::pooling_type() const {
492
508
  }
493
509
 
494
510
  float * llama_context::get_logits() {
511
+ output_reorder();
512
+
495
513
  return logits;
496
514
  }
497
515
 
498
516
  float * llama_context::get_logits_ith(int32_t i) {
499
517
  int64_t j = -1;
500
518
 
519
+ output_reorder();
520
+
501
521
  try {
502
522
  if (logits == nullptr) {
503
523
  throw std::runtime_error("no logits");
@@ -534,12 +554,16 @@ float * llama_context::get_logits_ith(int32_t i) {
534
554
  }
535
555
 
536
556
  float * llama_context::get_embeddings() {
557
+ output_reorder();
558
+
537
559
  return embd;
538
560
  }
539
561
 
540
562
  float * llama_context::get_embeddings_ith(int32_t i) {
541
563
  int64_t j = -1;
542
564
 
565
+ output_reorder();
566
+
543
567
  try {
544
568
  if (embd == nullptr) {
545
569
  throw std::runtime_error("no embeddings");
@@ -678,38 +702,59 @@ bool llama_context::apply_adapter_cvec(
678
702
  return cvec.apply(model, data, len, n_embd, il_start, il_end);
679
703
  }
680
704
 
681
- llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
705
+ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
682
706
  if (mctx && !mctx->apply()) {
683
707
  LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
684
708
  ret = GGML_STATUS_FAILED;
685
709
  return nullptr;
686
710
  }
687
711
 
688
- auto * gf = graph_init();
689
- if (!gf) {
690
- LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__);
691
- ret = GGML_STATUS_FAILED;
692
- return nullptr;
693
- }
712
+ auto * res = gf_res_prev.get();
713
+ auto * gf = res->get_gf();
694
714
 
695
- auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mctx);
696
- if (!res) {
697
- LLAMA_LOG_ERROR("%s: failed to build graph\n", __func__);
698
- ret = GGML_STATUS_FAILED;
699
- return nullptr;
700
- }
715
+ // the new graph parameters
716
+ // in order to correctly reuse a graph, its full topology has to be uniquely determined by these parameters
717
+ const auto gparams = graph_params(res, ubatch, mctx, gtype);
701
718
 
702
- // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
719
+ if (res->can_reuse(gparams)) {
720
+ //LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
703
721
 
704
- if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) {
705
- LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
706
- ret = GGML_STATUS_ALLOC_FAILED;
707
- return nullptr;
722
+ n_reused++;
723
+ } else {
724
+ res->reset();
725
+
726
+ ggml_backend_sched_reset(sched.get());
727
+ ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
728
+
729
+ //const auto t_start_us = ggml_time_us();
730
+
731
+ gf = model.build_graph(gparams);
732
+
733
+ //LLAMA_LOG_INFO("graph build time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
734
+
735
+ if (!gf) {
736
+ LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__);
737
+ ret = GGML_STATUS_FAILED;
738
+ return nullptr;
739
+ }
740
+
741
+ if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) {
742
+ LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
743
+ ret = GGML_STATUS_ALLOC_FAILED;
744
+ return nullptr;
745
+ }
708
746
  }
709
747
 
710
- res->set_inputs(&ubatch);
748
+ // set the input data for the input tensors
749
+ {
750
+ //const auto t_start_us = ggml_time_us();
751
+
752
+ res->set_inputs(&ubatch);
753
+
754
+ //LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
755
+ }
711
756
 
712
- const auto status = graph_compute(gf, ubatch.n_tokens > 1);
757
+ const auto status = graph_compute(res->get_gf(), ubatch.n_tokens > 1);
713
758
  if (status != GGML_STATUS_SUCCESS) {
714
759
  LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status);
715
760
  ret = status;
@@ -731,16 +776,19 @@ int llama_context::encode(const llama_batch & batch_inp) {
731
776
 
732
777
  const auto & hparams = model.hparams;
733
778
 
734
- const int64_t n_embd = hparams.n_embd;
779
+ const int64_t n_embd = hparams.n_embd;
780
+ const int32_t n_vocab = model.vocab.n_tokens();
735
781
 
736
782
  // note: during encode, we always pass the full sequence starting from pos = 0
737
- if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, true)) {
783
+ if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
738
784
  LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
739
785
  return -1;
740
786
  }
741
787
 
742
788
  const uint32_t n_tokens = balloc->get_n_tokens();
743
789
 
790
+ // [TAG_NO_CACHE_PAD]
791
+ // TODO: add new split mode where we pad the input sequences so that ubatch.equal_seqs == true
744
792
  const llama_ubatch ubatch = balloc->split_simple(n_tokens);
745
793
 
746
794
  // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
@@ -767,9 +815,6 @@ int llama_context::encode(const llama_batch & batch_inp) {
767
815
 
768
816
  n_outputs = n_tokens;
769
817
 
770
- ggml_backend_sched_reset(sched.get());
771
- ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
772
-
773
818
  const auto causal_attn_org = cparams.causal_attn;
774
819
 
775
820
  // always use non-causal attention for encoder graphs
@@ -778,7 +823,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
778
823
  cparams.causal_attn = false;
779
824
 
780
825
  ggml_status status;
781
- const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);
826
+ const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);
782
827
 
783
828
  cparams.causal_attn = causal_attn_org;
784
829
 
@@ -791,10 +836,20 @@ int llama_context::encode(const llama_batch & batch_inp) {
791
836
  }
792
837
  }
793
838
 
839
+ auto * t_logits = res->get_logits();
794
840
  auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd();
795
841
 
842
+ // extract logits
843
+ if (logits && t_logits) {
844
+ ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
845
+ GGML_ASSERT(backend_res != nullptr);
846
+ GGML_ASSERT(logits != nullptr);
847
+
848
+ ggml_backend_tensor_get_async(backend_res, t_logits, logits, 0, n_tokens*n_vocab*sizeof(float));
849
+ }
850
+
796
851
  // extract embeddings
797
- if (t_embd) {
852
+ if (embd && t_embd) {
798
853
  ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
799
854
  GGML_ASSERT(backend_embd != nullptr);
800
855
 
@@ -844,9 +899,11 @@ int llama_context::encode(const llama_batch & batch_inp) {
844
899
  }
845
900
  }
846
901
 
847
- // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
848
- // overlap with device computation.
849
- ggml_backend_sched_reset(sched.get());
902
+ if (!supports_set_rows) {
903
+ // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
904
+ // overlap with device computation.
905
+ ggml_backend_sched_reset(sched.get());
906
+ }
850
907
 
851
908
  // TODO: hacky solution
852
909
  if (model.arch == LLM_ARCH_T5 && t_embd) {
@@ -899,7 +956,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
899
956
  // when computing embeddings, all tokens are output
900
957
  const bool output_all = cparams.embeddings;
901
958
 
902
- if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, output_all)) {
959
+ if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
903
960
  LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
904
961
  return -1;
905
962
  }
@@ -927,6 +984,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
927
984
 
928
985
  // TODO: this clear of the buffer can easily be forgotten - need something better
929
986
  embd_seq.clear();
987
+ output_swaps.clear();
930
988
 
931
989
  bool did_optimize = false;
932
990
 
@@ -1005,11 +1063,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
1005
1063
  n_outputs = n_outputs_new;
1006
1064
  }
1007
1065
 
1008
- ggml_backend_sched_reset(sched.get());
1009
- ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
1010
-
1011
1066
  ggml_status status;
1012
- const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
1067
+ const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
1013
1068
 
1014
1069
  if (!res) {
1015
1070
  // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
@@ -1149,9 +1204,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
1149
1204
  // make the outputs have the same order they had in the user-provided batch
1150
1205
  // note: this is mostly relevant for recurrent models atm
1151
1206
  if (!sorted_output) {
1152
- const uint32_t n_vocab = model.vocab.n_tokens();
1153
- const uint64_t n_embd = model.hparams.n_embd;
1154
-
1155
1207
  GGML_ASSERT((size_t) n_outputs == out_ids.size());
1156
1208
 
1157
1209
  // TODO: is there something more efficient which also minimizes swaps?
@@ -1167,16 +1219,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
1167
1219
  continue;
1168
1220
  }
1169
1221
  std::swap(out_ids[i], out_ids[j_min]);
1170
- if (logits_size > 0) {
1171
- for (uint32_t k = 0; k < n_vocab; k++) {
1172
- std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]);
1173
- }
1174
- }
1175
- if (embd_size > 0) {
1176
- for (uint32_t k = 0; k < n_embd; k++) {
1177
- std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]);
1178
- }
1179
- }
1222
+
1223
+ // remember the swaps and apply them lazily upon logits/embeddings access
1224
+ output_swaps.push_back({ i, j_min });
1180
1225
  }
1181
1226
 
1182
1227
  std::fill(output_ids.begin(), output_ids.end(), -1);
@@ -1190,9 +1235,11 @@ int llama_context::decode(const llama_batch & batch_inp) {
1190
1235
  // wait for the computation to finish (automatically done when obtaining the model output)
1191
1236
  //synchronize();
1192
1237
 
1193
- // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
1194
- // overlap with device computation.
1195
- ggml_backend_sched_reset(sched.get());
1238
+ if (!supports_set_rows) {
1239
+ // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
1240
+ // overlap with device computation.
1241
+ ggml_backend_sched_reset(sched.get());
1242
+ }
1196
1243
 
1197
1244
  return 0;
1198
1245
  }
@@ -1271,24 +1318,40 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
1271
1318
  return n_outputs_max;
1272
1319
  }
1273
1320
 
1321
+ void llama_context::output_reorder() {
1322
+ const uint32_t n_vocab = model.vocab.n_tokens();
1323
+ const uint64_t n_embd = model.hparams.n_embd;
1324
+
1325
+ for (uint32_t s = 0; s < output_swaps.size(); ++s) {
1326
+ const uint32_t i0 = output_swaps[s].i0;
1327
+ const uint32_t i1 = output_swaps[s].i1;
1328
+
1329
+ if (logits_size > 0) {
1330
+ for (uint32_t k = 0; k < n_vocab; k++) {
1331
+ std::swap(logits[i0*n_vocab + k], logits[i1*n_vocab + k]);
1332
+ }
1333
+ }
1334
+
1335
+ if (embd_size > 0) {
1336
+ for (uint32_t k = 0; k < n_embd; k++) {
1337
+ std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
1338
+ }
1339
+ }
1340
+ }
1341
+
1342
+ output_swaps.clear();
1343
+ }
1344
+
1274
1345
  //
1275
1346
  // graph
1276
1347
  //
1277
1348
 
1278
- int32_t llama_context::graph_max_nodes() const {
1279
- return std::max<int32_t>(65536, 5*model.n_tensors());
1349
+ uint32_t llama_context::graph_max_nodes() const {
1350
+ return std::max<uint32_t>(1024u, 8u*model.n_tensors());
1280
1351
  }
1281
1352
 
1282
- ggml_cgraph * llama_context::graph_init() {
1283
- ggml_init_params params = {
1284
- /*.mem_size =*/ buf_compute_meta.size(),
1285
- /*.mem_buffer =*/ buf_compute_meta.data(),
1286
- /*.no_alloc =*/ true,
1287
- };
1288
-
1289
- ctx_compute.reset(ggml_init(params));
1290
-
1291
- return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false);
1353
+ llm_graph_result * llama_context::get_gf_res_reserve() const {
1354
+ return static_cast<llm_graph_result *>(gf_res_reserve.get());
1292
1355
  }
1293
1356
 
1294
1357
  ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) {
@@ -1301,6 +1364,11 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
1301
1364
  LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
1302
1365
  }
1303
1366
 
1367
+ ggml_backend_sched_reset(sched.get());
1368
+
1369
+ // when the scheduler is reset, we cannot reuse the old graph, so we reset the previous graph result to prevent that
1370
+ gf_res_prev->reset();
1371
+
1304
1372
  // store the n_outputs as it is, and restore it afterwards
1305
1373
  // TODO: not sure if needed, might simplify in the future by removing this
1306
1374
  const auto save_n_outputs = this->n_outputs;
@@ -1310,17 +1378,15 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
1310
1378
  llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
1311
1379
  llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);
1312
1380
 
1313
- auto * gf = graph_init();
1314
- auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx);
1381
+ auto * res = gf_res_reserve.get();
1315
1382
 
1316
- this->n_outputs = save_n_outputs;
1383
+ const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);
1317
1384
 
1318
- if (!res) {
1319
- LLAMA_LOG_ERROR("%s: failed to build worst-case graph\n", __func__);
1320
- return nullptr;
1321
- }
1385
+ res->reset();
1322
1386
 
1323
- ggml_backend_sched_reset(sched.get());
1387
+ auto * gf = model.build_graph(gparams);
1388
+
1389
+ this->n_outputs = save_n_outputs;
1324
1390
 
1325
1391
  // initialize scheduler with the specified graph
1326
1392
  if (!ggml_backend_sched_reserve(sched.get(), gf)) {
@@ -1331,28 +1397,27 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
1331
1397
  return gf;
1332
1398
  }
1333
1399
 
1334
- llm_graph_result_ptr llama_context::graph_build(
1335
- ggml_context * ctx,
1336
- ggml_cgraph * gf,
1337
- const llama_ubatch & ubatch,
1338
- llm_graph_type gtype,
1339
- const llama_memory_context_i * mctx) {
1340
- return model.build_graph(
1341
- {
1342
- /*.ctx =*/ ctx,
1343
- /*.arch =*/ model.arch,
1344
- /*.hparams =*/ model.hparams,
1345
- /*.cparams =*/ cparams,
1346
- /*.ubatch =*/ ubatch,
1347
- /*.sched =*/ sched.get(),
1348
- /*.backend_cpu =*/ backend_cpu,
1349
- /*.cvec =*/ &cvec,
1350
- /*.loras =*/ &loras,
1351
- /*.mctx =*/ mctx,
1352
- /*.cross =*/ &cross,
1353
- /*.n_outputs =*/ n_outputs,
1354
- /*.cb =*/ graph_get_cb(),
1355
- }, gf, gtype);
1400
+ llm_graph_params llama_context::graph_params(
1401
+ llm_graph_result * res,
1402
+ const llama_ubatch & ubatch,
1403
+ const llama_memory_context_i * mctx,
1404
+ llm_graph_type gtype) const {
1405
+ return {
1406
+ /*.arch =*/ model.arch,
1407
+ /*.hparams =*/ model.hparams,
1408
+ /*.cparams =*/ cparams,
1409
+ /*.ubatch =*/ ubatch,
1410
+ /*.gtype =*/ gtype,
1411
+ /*.sched =*/ sched.get(),
1412
+ /*.backend_cpu =*/ backend_cpu,
1413
+ /*.cvec =*/ &cvec,
1414
+ /*.loras =*/ &loras,
1415
+ /*.mctx =*/ mctx,
1416
+ /*.cross =*/ &cross,
1417
+ /*.n_outputs =*/ n_outputs,
1418
+ /*.cb =*/ graph_get_cb(),
1419
+ /*.res =*/ res,
1420
+ };
1356
1421
  }
1357
1422
 
1358
1423
  ggml_status llama_context::graph_compute(
@@ -1930,6 +1995,7 @@ llama_perf_context_data llama_context::perf_get_data() const {
1930
1995
  data.t_eval_ms = 1e-3 * t_eval_us;
1931
1996
  data.n_p_eval = std::max(1, n_p_eval);
1932
1997
  data.n_eval = std::max(1, n_eval);
1998
+ data.n_reused = std::max(0, n_reused);
1933
1999
 
1934
2000
  return data;
1935
2001
  }
@@ -1938,6 +2004,7 @@ void llama_context::perf_reset() {
1938
2004
  t_start_us = ggml_time_us();
1939
2005
  t_eval_us = n_eval = 0;
1940
2006
  t_p_eval_us = n_p_eval = 0;
2007
+ n_reused = 0;
1941
2008
  }
1942
2009
 
1943
2010
  //
@@ -2028,7 +2095,7 @@ void llama_context::opt_epoch_iter(
2028
2095
  batch.logits [pos_batch] = true;
2029
2096
  }
2030
2097
 
2031
- if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, true)) {
2098
+ if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
2032
2099
  LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
2033
2100
  return;
2034
2101
  }
@@ -2064,8 +2131,13 @@ void llama_context::opt_epoch_iter(
2064
2131
  break;
2065
2132
  }
2066
2133
 
2067
- auto * gf = graph_init();
2068
- auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx.get());
2134
+ auto * res = gf_res_prev.get();
2135
+
2136
+ const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT);
2137
+
2138
+ res->reset();
2139
+
2140
+ auto * gf = model.build_graph(gparams);
2069
2141
 
2070
2142
  struct ggml_context * ctx_compute_opt;
2071
2143
  {
@@ -2187,6 +2259,7 @@ llama_context_params llama_context_default_params() {
2187
2259
  /*.no_perf =*/ true,
2188
2260
  /*.op_offload =*/ true,
2189
2261
  /*.swa_full =*/ true,
2262
+ /*.kv_unified =*/ false,
2190
2263
  };
2191
2264
 
2192
2265
  return result;
@@ -2807,6 +2880,7 @@ void llama_perf_context_print(const llama_context * ctx) {
2807
2880
  LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
2808
2881
  __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
2809
2882
  LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
2883
+ LLAMA_LOG_INFO("%s: graphs reused = %10d\n", __func__, data.n_reused);
2810
2884
  }
2811
2885
 
2812
2886
  void llama_perf_context_reset(llama_context * ctx) {
@@ -35,8 +35,6 @@ struct llama_context {
35
35
 
36
36
  ggml_backend_sched_t get_sched() const;
37
37
 
38
- ggml_context * get_ctx_compute() const;
39
-
40
38
  uint32_t n_ctx() const;
41
39
  uint32_t n_ctx_per_seq() const;
42
40
  uint32_t n_batch() const;
@@ -96,7 +94,7 @@ struct llama_context {
96
94
  // if memory_context is provided, it will be applied first to the context's memory
97
95
  // ret contains the status of the graph computation
98
96
  // returns nullptr only if ret != GGML_STATUS_SUCCESS
99
- llm_graph_result_ptr process_ubatch(
97
+ llm_graph_result * process_ubatch(
100
98
  const llama_ubatch & ubatch,
101
99
  llm_graph_type gtype,
102
100
  llama_memory_context_i * mctx,
@@ -183,15 +181,17 @@ private:
183
181
  // Returns max number of outputs for which space was reserved.
184
182
  uint32_t output_reserve(int32_t n_outputs);
185
183
 
184
+ void output_reorder();
185
+
186
186
  //
187
187
  // graph
188
188
  //
189
189
 
190
190
  public:
191
- int32_t graph_max_nodes() const;
191
+ uint32_t graph_max_nodes() const;
192
192
 
193
- // zero-out inputs and create the ctx_compute for the compute graph
194
- ggml_cgraph * graph_init();
193
+ // can reuse the llm_graph_result instance of the context (for example to update a memory module)
194
+ llm_graph_result * get_gf_res_reserve() const;
195
195
 
196
196
  // returns the result of ggml_backend_sched_graph_compute_async execution
197
197
  ggml_status graph_compute(ggml_cgraph * gf, bool batched);
@@ -200,12 +200,11 @@ public:
200
200
  ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);
201
201
 
202
202
  private:
203
- llm_graph_result_ptr graph_build(
204
- ggml_context * ctx,
205
- ggml_cgraph * gf,
206
- const llama_ubatch & ubatch,
207
- llm_graph_type gtype,
208
- const llama_memory_context_i * mctx);
203
+ llm_graph_params graph_params(
204
+ llm_graph_result * res,
205
+ const llama_ubatch & ubatch,
206
+ const llama_memory_context_i * mctx,
207
+ llm_graph_type gtype) const;
209
208
 
210
209
  llm_graph_cb graph_get_cb() const;
211
210
 
@@ -253,13 +252,18 @@ private:
253
252
 
254
253
  std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
255
254
 
255
+ struct swap_info {
256
+ uint32_t i0;
257
+ uint32_t i1;
258
+ };
259
+
260
+ std::vector<swap_info> output_swaps;
261
+
256
262
  ggml_backend_sched_ptr sched;
257
263
 
258
264
  ggml_backend_t backend_cpu = nullptr;
259
265
  std::vector<ggml_backend_ptr> backends;
260
266
 
261
- ggml_context_ptr ctx_compute;
262
-
263
267
  // training
264
268
  ggml_opt_context_t opt_ctx = nullptr;
265
269
 
@@ -275,14 +279,18 @@ private:
275
279
  std::vector<ggml_backend_t> backend_ptrs;
276
280
  std::vector<ggml_backend_buffer_type_t> backend_buft;
277
281
 
278
- // memory buffers used to evaluate the model
279
- std::vector<uint8_t> buf_compute_meta;
282
+ llm_graph_result_ptr gf_res_prev;
283
+ llm_graph_result_ptr gf_res_reserve;
280
284
 
281
285
  // host buffer for the model output (logits and embeddings)
282
286
  ggml_backend_buffer_ptr buf_output;
283
287
 
284
288
  bool has_evaluated_once = false;
285
289
 
290
+ // env: LLAMA_SET_ROWS (temporary)
291
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14285
292
+ bool supports_set_rows = false;
293
+
286
294
  // perf
287
295
  mutable int64_t t_start_us = 0;
288
296
  mutable int64_t t_load_us = 0;
@@ -294,4 +302,6 @@ private:
294
302
 
295
303
  mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
296
304
  mutable int32_t n_eval = 0; // number of eval calls
305
+
306
+ mutable int32_t n_reused = 0; // number of times the previous graph was reused
297
307
  };
@@ -11,8 +11,8 @@ struct llama_cparams {
11
11
  uint32_t n_batch;
12
12
  uint32_t n_ubatch;
13
13
  uint32_t n_seq_max;
14
- int n_threads; // number of threads to use for generation
15
- int n_threads_batch; // number of threads to use for batch processing
14
+ int32_t n_threads; // number of threads to use for generation
15
+ int32_t n_threads_batch; // number of threads to use for batch processing
16
16
 
17
17
  float rope_freq_base;
18
18
  float rope_freq_scale;
@@ -33,6 +33,7 @@ struct llama_cparams {
33
33
  bool no_perf;
34
34
  bool warmup;
35
35
  bool op_offload;
36
+ bool kv_unified;
36
37
 
37
38
  enum llama_pooling_type pooling_type;
38
39