@novastera-oss/llamarn 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319) hide show
  1. package/android/src/main/cpp/include/llama.h +8 -3
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  10. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  11. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  13. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  14. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  15. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  18. package/cpp/LlamaCppModel.cpp +56 -22
  19. package/cpp/build-info.cpp +2 -2
  20. package/cpp/llama.cpp/CMakeLists.txt +1 -2
  21. package/cpp/llama.cpp/README.md +4 -5
  22. package/cpp/llama.cpp/build-xcframework.sh +1 -1
  23. package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
  24. package/cpp/llama.cpp/common/arg.cpp +24 -0
  25. package/cpp/llama.cpp/common/chat.cpp +37 -20
  26. package/cpp/llama.cpp/common/chat.h +2 -0
  27. package/cpp/llama.cpp/common/common.cpp +3 -0
  28. package/cpp/llama.cpp/common/common.h +5 -0
  29. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  30. package/cpp/llama.cpp/convert_hf_to_gguf.py +860 -23
  31. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
  32. package/cpp/llama.cpp/ggml/CMakeLists.txt +8 -2
  33. package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
  34. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  35. package/cpp/llama.cpp/ggml/include/ggml.h +206 -10
  36. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
  38. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
  39. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
  40. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +37 -3
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -103
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1405 -240
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +8 -0
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +212 -34
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +35 -11
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +187 -54
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +71 -29
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +12 -6
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +269 -110
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  88. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  89. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
  90. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
  91. package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
  92. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  93. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
  94. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  95. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  96. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
  99. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
  100. package/cpp/llama.cpp/ggml/src/ggml-impl.h +125 -183
  101. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
  102. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +51 -9
  103. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +394 -80
  104. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +616 -239
  105. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  106. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +741 -571
  108. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
  110. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  111. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  112. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
  114. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
  115. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
  116. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
  117. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  118. package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
  119. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +697 -1098
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
  131. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +104 -62
  132. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  133. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  134. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  135. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
  136. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  137. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +39 -38
  138. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
  139. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  140. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  141. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  142. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  143. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  144. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  145. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  146. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +767 -292
  147. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  148. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
  149. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
  150. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
  151. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
  152. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
  153. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
  154. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  156. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  158. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  159. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  160. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
  161. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
  162. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  163. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  164. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  166. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  167. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  168. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  169. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
  170. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  172. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
  173. package/cpp/llama.cpp/ggml/src/ggml.c +449 -72
  174. package/cpp/llama.cpp/ggml/src/gguf.cpp +13 -2
  175. package/cpp/llama.cpp/gguf-py/gguf/constants.py +285 -0
  176. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +27 -0
  177. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +137 -21
  178. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +109 -7
  179. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  180. package/cpp/llama.cpp/include/llama.h +8 -43
  181. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  182. package/cpp/llama.cpp/src/llama-arch.cpp +265 -3
  183. package/cpp/llama.cpp/src/llama-arch.h +36 -1
  184. package/cpp/llama.cpp/src/llama-batch.cpp +596 -359
  185. package/cpp/llama.cpp/src/llama-batch.h +105 -70
  186. package/cpp/llama.cpp/src/llama-chat.cpp +26 -6
  187. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  188. package/cpp/llama.cpp/src/llama-context.cpp +101 -107
  189. package/cpp/llama.cpp/src/llama-context.h +13 -13
  190. package/cpp/llama.cpp/src/llama-graph.cpp +286 -404
  191. package/cpp/llama.cpp/src/llama-graph.h +78 -79
  192. package/cpp/llama.cpp/src/llama-hparams.cpp +11 -1
  193. package/cpp/llama.cpp/src/llama-hparams.h +11 -0
  194. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +74 -66
  195. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
  196. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +312 -157
  197. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +79 -46
  198. package/cpp/llama.cpp/src/llama-kv-cells.h +97 -21
  199. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +73 -69
  200. package/cpp/llama.cpp/src/llama-memory-hybrid.h +19 -22
  201. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +88 -77
  202. package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
  203. package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
  204. package/cpp/llama.cpp/src/llama-memory.h +21 -22
  205. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  206. package/cpp/llama.cpp/src/llama-model.cpp +5301 -2922
  207. package/cpp/llama.cpp/src/llama-model.h +40 -0
  208. package/cpp/llama.cpp/src/llama-quant.cpp +88 -5
  209. package/cpp/llama.cpp/src/llama-vocab.cpp +37 -3
  210. package/cpp/llama.cpp/src/llama-vocab.h +42 -0
  211. package/cpp/rn-utils.h +3 -0
  212. package/ios/include/chat.h +2 -0
  213. package/ios/include/common.h +5 -0
  214. package/ios/include/llama.h +8 -43
  215. package/ios/libs/llama.xcframework/Info.plist +19 -19
  216. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  217. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
  218. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  219. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  220. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +206 -10
  221. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -43
  222. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  223. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  224. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
  225. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
  226. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  227. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  228. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
  229. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
  230. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  231. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  232. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
  233. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3744
  234. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
  235. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  236. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +206 -10
  237. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -43
  238. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
  239. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  240. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +206 -10
  241. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -43
  242. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  243. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
  244. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  245. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +206 -10
  246. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -43
  247. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  248. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  249. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  250. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
  251. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  252. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  253. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +206 -10
  254. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -43
  255. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  256. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  257. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
  258. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
  259. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  260. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  261. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
  262. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
  263. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  264. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  265. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5095 -4900
  266. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  267. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  268. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +206 -10
  269. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -43
  270. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  271. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  272. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4871
  273. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3773
  274. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  275. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  276. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
  277. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
  278. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  279. package/package.json +1 -1
  280. package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
  281. package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  282. package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  283. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  284. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  285. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  286. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  287. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  288. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  289. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  290. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  291. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  292. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  293. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  294. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  295. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  296. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  297. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  298. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  299. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  300. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  301. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  302. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  303. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  304. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  305. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  306. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  307. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  308. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  309. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  310. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  311. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  312. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  313. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  314. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  315. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  316. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  317. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  318. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  319. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
@@ -32,16 +32,21 @@ enum llm_type {
32
32
  LLM_TYPE_190M,
33
33
  LLM_TYPE_220M,
34
34
  LLM_TYPE_250M,
35
+ LLM_TYPE_256M,
35
36
  LLM_TYPE_270M,
36
37
  LLM_TYPE_335M,
38
+ LLM_TYPE_350M,
37
39
  LLM_TYPE_410M,
38
40
  LLM_TYPE_450M,
39
41
  LLM_TYPE_475M,
42
+ LLM_TYPE_700M,
40
43
  LLM_TYPE_770M,
41
44
  LLM_TYPE_780M,
45
+ LLM_TYPE_0_3B,
42
46
  LLM_TYPE_0_5B,
43
47
  LLM_TYPE_0_6B,
44
48
  LLM_TYPE_1B,
49
+ LLM_TYPE_1_2B,
45
50
  LLM_TYPE_1_3B,
46
51
  LLM_TYPE_1_4B,
47
52
  LLM_TYPE_1_5B,
@@ -93,8 +98,11 @@ enum llm_type {
93
98
  LLM_TYPE_57B_A14B,
94
99
  LLM_TYPE_17B_16E, // llama4 Scout
95
100
  LLM_TYPE_17B_128E, // llama4 Maverick
101
+ LLM_TYPE_A13B,
96
102
  LLM_TYPE_30B_A3B,
97
103
  LLM_TYPE_235B_A22B,
104
+ LLM_TYPE_E2B,
105
+ LLM_TYPE_E4B,
98
106
  };
99
107
 
100
108
  std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
@@ -150,6 +158,12 @@ struct llama_layer_convnext {
150
158
  struct ggml_tensor * gamma = nullptr;
151
159
  };
152
160
 
161
+ struct llama_layer_shortconv {
162
+ struct ggml_tensor * in_proj = nullptr;
163
+ struct ggml_tensor * conv = nullptr;
164
+ struct ggml_tensor * out_proj = nullptr;
165
+ };
166
+
153
167
  struct llama_layer {
154
168
  // normalization
155
169
  struct ggml_tensor * attn_norm = nullptr;
@@ -169,6 +183,10 @@ struct llama_layer {
169
183
  struct ggml_tensor * ffn_sub_norm = nullptr;
170
184
  struct ggml_tensor * attn_norm_cross = nullptr;
171
185
  struct ggml_tensor * attn_norm_enc = nullptr;
186
+ struct ggml_tensor * ssm_norm = nullptr;
187
+ struct ggml_tensor * ssm_dt_norm = nullptr;
188
+ struct ggml_tensor * ssm_b_norm = nullptr;
189
+ struct ggml_tensor * ssm_c_norm = nullptr;
172
190
 
173
191
  // attention
174
192
  struct ggml_tensor * wq = nullptr;
@@ -316,9 +334,24 @@ struct llama_layer {
316
334
  struct ggml_tensor * ffn_up_scale = nullptr;
317
335
  struct ggml_tensor * ffn_down_scale = nullptr;
318
336
 
337
+ // altup & laurel
338
+ struct ggml_tensor * per_layer_inp_gate = nullptr;
339
+ struct ggml_tensor * per_layer_proj = nullptr;
340
+ struct ggml_tensor * per_layer_post_norm = nullptr;
341
+ struct ggml_tensor * altup_correct_coef = nullptr;
342
+ struct ggml_tensor * altup_correct_scale = nullptr;
343
+ struct ggml_tensor * altup_predict_coef = nullptr;
344
+ struct ggml_tensor * altup_router = nullptr;
345
+ struct ggml_tensor * altup_router_norm = nullptr;
346
+ struct ggml_tensor * laurel_l = nullptr;
347
+ struct ggml_tensor * laurel_r = nullptr;
348
+ struct ggml_tensor * laurel_post_norm = nullptr;
349
+
319
350
  struct llama_layer_posnet posnet;
320
351
 
321
352
  struct llama_layer_convnext convnext;
353
+
354
+ struct llama_layer_shortconv shortconv;
322
355
  };
323
356
 
324
357
  struct llama_model {
@@ -354,6 +387,13 @@ struct llama_model {
354
387
  struct ggml_tensor * conv1d = nullptr;
355
388
  struct ggml_tensor * conv1d_b = nullptr;
356
389
 
390
+ // gemma3n altup
391
+ struct ggml_tensor * tok_embd_per_layer = nullptr;
392
+ struct ggml_tensor * altup_proj = nullptr;
393
+ struct ggml_tensor * altup_unembd_proj = nullptr;
394
+ struct ggml_tensor * per_layer_model_proj = nullptr;
395
+ struct ggml_tensor * per_layer_proj_norm = nullptr;
396
+
357
397
  std::vector<llama_layer> layers;
358
398
 
359
399
  llama_model_params params;
@@ -1,5 +1,4 @@
1
1
  #include "llama-quant.h"
2
-
3
2
  #include "llama-impl.h"
4
3
  #include "llama-model.h"
5
4
  #include "llama-model-loader.h"
@@ -27,6 +26,56 @@ static void zeros(std::ofstream & file, size_t n) {
27
26
  }
28
27
  }
29
28
 
29
+ static std::string remap_layer(const std::string & orig_name, const std::vector<int> & prune, std::map<int, std::string> & mapped, int & next_id) {
30
+ if (prune.empty()) {
31
+ return orig_name;
32
+ }
33
+
34
+ static const std::regex pattern(R"(blk\.(\d+)\.)");
35
+ if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
36
+ const int blk = std::stoi(match[1]);
37
+ std::string new_name = orig_name;
38
+
39
+ if (mapped.count(blk)) {
40
+ // Already mapped, do nothing
41
+ } else if (std::find(prune.begin(), prune.end(), blk) != prune.end()) {
42
+ mapped[blk] = "";
43
+ } else if (blk < prune.front()) {
44
+ mapped[blk] = std::to_string(blk);
45
+ next_id = blk + 1;
46
+ } else {
47
+ mapped[blk] = std::to_string(next_id);
48
+ ++next_id;
49
+ }
50
+
51
+ return mapped[blk].empty() ? mapped[blk] : new_name.replace(match.position(1), match.length(1), mapped[blk]);
52
+ }
53
+
54
+ return orig_name;
55
+ }
56
+
57
+ static std::string remap_imatrix (const std::string & orig_name, const std::map<int, std::string> & mapped) {
58
+ if (mapped.empty()) {
59
+ return orig_name;
60
+ }
61
+
62
+ static const std::regex pattern(R"(blk\.(\d+)\.)");
63
+ if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
64
+ const std::string blk(match[1]);
65
+ std::string new_name = orig_name;
66
+
67
+ for (const auto & p : mapped) {
68
+ if (p.second == blk) {
69
+ LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first);
70
+ return new_name.replace(match.position(1), match.length(1), std::to_string(p.first));
71
+ }
72
+ }
73
+ GGML_ABORT("\n%s: imatrix mapping error for %s\n", __func__, orig_name.c_str());
74
+ }
75
+
76
+ return orig_name;
77
+ }
78
+
30
79
  struct quantize_state_impl {
31
80
  const llama_model & model;
32
81
  const llama_model_quantize_params * params;
@@ -174,7 +223,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
174
223
  new_type = GGML_TYPE_Q6_K;
175
224
  }
176
225
  }
177
- } else if (name == "token_embd.weight") {
226
+ } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
178
227
  if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
179
228
  new_type = qs.params->token_embedding_type;
180
229
  } else {
@@ -568,6 +617,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
568
617
  const size_t align = GGUF_DEFAULT_ALIGNMENT;
569
618
  gguf_context_ptr ctx_out { gguf_init_empty() };
570
619
 
620
+ std::vector<int> prune_list = {};
621
+ if (params->prune_layers) {
622
+ prune_list = *static_cast<const std::vector<int> *>(params->prune_layers);
623
+ }
624
+
571
625
  // copy the KV pairs from the input file
572
626
  gguf_set_kv (ctx_out.get(), ml.meta.get());
573
627
  gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
@@ -597,12 +651,32 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
597
651
  }
598
652
  }
599
653
 
654
+ std::map<int, std::string> mapped;
655
+ int blk_id = 0;
656
+ int pruned_attention_w = 0;
657
+
600
658
  // make a list of weights
601
659
  std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
602
660
  tensors.reserve(ml.weights_map.size());
603
661
  for (const auto & it : ml.weights_map) {
662
+ const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
663
+ if (remapped_name.empty()) {
664
+ if (it.first.find("attn_v.weight") != std::string::npos ||
665
+ it.first.find("attn_qkv.weight") != std::string::npos ||
666
+ it.first.find("attn_kv_b.weight") != std::string::npos) {
667
+ pruned_attention_w++;
668
+ }
669
+ LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
670
+ continue;
671
+ } else if (remapped_name != it.first) {
672
+ ggml_set_name(it.second.tensor, remapped_name.c_str());
673
+ LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
674
+ }
604
675
  tensors.push_back(&it.second);
605
676
  }
677
+ if (!prune_list.empty()) {
678
+ gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), blk_id);
679
+ }
606
680
 
607
681
  // keep_split requires that the weights are sorted by split index
608
682
  if (params->keep_split) {
@@ -640,7 +714,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
640
714
  if (llama_model_has_encoder(&model)) {
641
715
  n_attn_layer *= 3;
642
716
  }
643
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
717
+ GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
644
718
  }
645
719
 
646
720
  size_t total_size_org = 0;
@@ -681,7 +755,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
681
755
  for (size_t i = 0; i < ctx_outs.size(); ++i) {
682
756
  gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
683
757
  gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
684
- gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
758
+ gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), (int32_t)tensors.size());
685
759
  }
686
760
  }
687
761
 
@@ -756,6 +830,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
756
830
  // NOTE: can't use LLM_TN here because the layer number is not known
757
831
  quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
758
832
 
833
+ // these are very small (e.g. 4x4)
834
+ quantize &= name.find("altup") == std::string::npos;
835
+ quantize &= name.find("laurel") == std::string::npos;
836
+
837
+ // these are not too big, so keep them as they are
838
+ quantize &= name.find("per_layer_model_proj") == std::string::npos;
839
+
759
840
  // do not quantize positional embeddings and token types (BERT)
760
841
  quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
761
842
  quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
@@ -763,6 +844,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
763
844
  // do not quantize Mamba's small yet 2D weights
764
845
  // NOTE: can't use LLM_TN here because the layer number is not known
765
846
  quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
847
+ quantize &= name.find("shortconv.conv.weight") == std::string::npos;
766
848
 
767
849
  // do not quantize RWKV's small yet 2D weights
768
850
  quantize &= name.find("time_mix_first.weight") == std::string::npos;
@@ -832,7 +914,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
832
914
 
833
915
  const float * imatrix = nullptr;
834
916
  if (imatrix_data) {
835
- auto it = imatrix_data->find(tensor->name);
917
+ auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
836
918
  if (it == imatrix_data->end()) {
837
919
  LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
838
920
  } else {
@@ -947,6 +1029,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
947
1029
  /*.imatrix =*/ nullptr,
948
1030
  /*.kv_overrides =*/ nullptr,
949
1031
  /*.tensor_type =*/ nullptr,
1032
+ /*.prune_layers =*/ nullptr
950
1033
  };
951
1034
 
952
1035
  return result;
@@ -351,6 +351,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
351
351
  break;
352
352
  case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
353
353
  case LLAMA_VOCAB_PRE_TYPE_QWEN2:
354
+ case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
354
355
  regex_exprs = {
355
356
  // original regex from tokenizer.json
356
357
  // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
@@ -1269,6 +1270,7 @@ struct llama_vocab::impl {
1269
1270
  bool add_space_prefix = false;
1270
1271
  bool add_bos = false;
1271
1272
  bool add_eos = false;
1273
+ bool add_sep = false;
1272
1274
  bool ignore_merges = false;
1273
1275
  bool clean_spaces = false; // clean_up_tokenization_spaces
1274
1276
  bool remove_extra_whitespaces = false;
@@ -1421,6 +1423,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1421
1423
  special_sep_id = 102;
1422
1424
  special_pad_id = 0;
1423
1425
  special_mask_id = 103;
1426
+
1427
+ add_sep = true;
1424
1428
  } else if (tokenizer_model == "gpt2") {
1425
1429
  type = LLAMA_VOCAB_TYPE_BPE;
1426
1430
 
@@ -1519,7 +1523,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1519
1523
  tokenizer_pre == "llama-v3" ||
1520
1524
  tokenizer_pre == "llama-bpe"||
1521
1525
  tokenizer_pre == "falcon3" ||
1522
- tokenizer_pre == "pixtral") {
1526
+ tokenizer_pre == "falcon-h1" ||
1527
+ tokenizer_pre == "pixtral" ||
1528
+ tokenizer_pre == "midm-2.0" ||
1529
+ tokenizer_pre == "lfm2") {
1523
1530
  pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
1524
1531
  ignore_merges = true;
1525
1532
  add_bos = true;
@@ -1550,12 +1557,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1550
1557
  tokenizer_pre == "jina-es" ||
1551
1558
  tokenizer_pre == "jina-de" ||
1552
1559
  tokenizer_pre == "gigachat" ||
1553
- tokenizer_pre == "jina-v1-en" ||
1554
1560
  tokenizer_pre == "jina-v2-es" ||
1555
1561
  tokenizer_pre == "jina-v2-de" ||
1562
+ tokenizer_pre == "a.x-4.0") {
1563
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1564
+ } else if (
1565
+ tokenizer_pre == "jina-v1-en" ||
1556
1566
  tokenizer_pre == "jina-v2-code" ||
1557
1567
  tokenizer_pre == "roberta-bpe") {
1558
1568
  pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1569
+ add_sep = true;
1559
1570
  } else if (
1560
1571
  tokenizer_pre == "refact") {
1561
1572
  pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
@@ -1650,6 +1661,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1650
1661
  tokenizer_pre == "seed-coder") {
1651
1662
  pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
1652
1663
  clean_spaces = false;
1664
+ } else if (
1665
+ tokenizer_pre == "hunyuan") {
1666
+ pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
1667
+ clean_spaces = false;
1653
1668
  } else {
1654
1669
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
1655
1670
  }
@@ -1665,6 +1680,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1665
1680
  clean_spaces = true;
1666
1681
  add_bos = true;
1667
1682
  add_eos = false;
1683
+ add_sep = true;
1668
1684
  } else if (type == LLAMA_VOCAB_TYPE_UGM) {
1669
1685
  pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1670
1686
  add_bos = false;
@@ -1801,7 +1817,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1801
1817
  }
1802
1818
  }
1803
1819
 
1804
- // Handle add_bos and add_eos
1820
+ // Handle add_bos, add_eos and add_sep
1805
1821
  {
1806
1822
  bool temp = true;
1807
1823
 
@@ -1811,6 +1827,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1811
1827
  if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
1812
1828
  add_eos = temp;
1813
1829
  }
1830
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) {
1831
+ add_sep = temp;
1832
+ }
1814
1833
  }
1815
1834
 
1816
1835
  // auto-detect special tokens by text
@@ -1829,6 +1848,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1829
1848
  || t.first == "<EOT>"
1830
1849
  || t.first == "_<EOT>"
1831
1850
  || t.first == "<|end▁of▁sentence|>" // DeepSeek
1851
+ || t.first == "<end_of_utterance>" // smoldocling
1832
1852
  ) {
1833
1853
  special_eot_id = t.second;
1834
1854
  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1988,6 +2008,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1988
2008
  || t.first == "<EOT>"
1989
2009
  || t.first == "_<EOT>"
1990
2010
  || t.first == "<|end_of_text|>"
2011
+ || t.first == "<end_of_utterance>" // smoldocling
1991
2012
  ) {
1992
2013
  special_eog_ids.insert(t.second);
1993
2014
  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -3000,6 +3021,10 @@ bool llama_vocab::get_add_eos() const {
3000
3021
  return pimpl->add_eos;
3001
3022
  }
3002
3023
 
3024
+ bool llama_vocab::get_add_sep() const {
3025
+ return pimpl->add_sep;
3026
+ }
3027
+
3003
3028
  bool llama_vocab::get_ignore_merges() const {
3004
3029
  return pimpl->ignore_merges;
3005
3030
  }
@@ -3060,6 +3085,11 @@ int32_t llama_vocab::tokenize(
3060
3085
  bool add_special,
3061
3086
  bool parse_special) const {
3062
3087
  auto res = tokenize(std::string(text, text_len), add_special, parse_special);
3088
+ if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
3089
+ LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size());
3090
+ return std::numeric_limits<int32_t>::min();
3091
+ }
3092
+
3063
3093
  if (n_tokens_max < (int) res.size()) {
3064
3094
  // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
3065
3095
  return -((int) res.size());
@@ -3191,6 +3221,10 @@ bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
3191
3221
  return vocab->get_add_eos();
3192
3222
  }
3193
3223
 
3224
+ bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) {
3225
+ return vocab->get_add_sep();
3226
+ }
3227
+
3194
3228
  llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
3195
3229
  return vocab->token_fim_pre();
3196
3230
  }
@@ -6,6 +6,47 @@
6
6
  #include <vector>
7
7
  #include <memory>
8
8
 
9
+ // pre-tokenization types
10
+ enum llama_vocab_pre_type {
11
+ LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
12
+ LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
13
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
14
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
15
+ LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
16
+ LLAMA_VOCAB_PRE_TYPE_MPT = 5,
17
+ LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
18
+ LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
19
+ LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
20
+ LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
21
+ LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
22
+ LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
23
+ LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
24
+ LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
25
+ LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
26
+ LLAMA_VOCAB_PRE_TYPE_PORO = 15,
27
+ LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
28
+ LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
29
+ LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
30
+ LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
31
+ LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
32
+ LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
33
+ LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
34
+ LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
35
+ LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
36
+ LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
37
+ LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
38
+ LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
39
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
40
+ LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
41
+ LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
42
+ LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
43
+ LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
44
+ LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
45
+ LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
46
+ LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
47
+ LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
48
+ };
49
+
9
50
  struct LLM_KV;
10
51
  struct llama_model_loader;
11
52
 
@@ -74,6 +115,7 @@ struct llama_vocab {
74
115
  bool get_add_space_prefix () const;
75
116
  bool get_add_bos () const;
76
117
  bool get_add_eos () const;
118
+ bool get_add_sep () const;
77
119
  bool get_ignore_merges () const;
78
120
  bool get_clean_spaces () const;
79
121
  bool get_remove_extra_whitespaces () const;
package/cpp/rn-utils.h CHANGED
@@ -54,6 +54,7 @@ struct CompletionOptions {
54
54
  float top_p = 0.9f;
55
55
  float top_k = 40.0f;
56
56
  float min_p = 0.05f;
57
+ float presence_penalty = 0.0f; // for reducing repetitions (0-2 range)
57
58
  int n_keep = 0;
58
59
  int n_probs = 0; // for log probabilities
59
60
  bool post_sampling_probs = false;
@@ -77,6 +78,7 @@ struct CompletionOptions {
77
78
  {"top_p", top_p},
78
79
  {"top_k", top_k},
79
80
  {"min_p", min_p},
81
+ {"presence_penalty", presence_penalty},
80
82
  {"n_predict", n_predict},
81
83
  {"n_keep", n_keep},
82
84
  {"n_probs", n_probs},
@@ -147,6 +149,7 @@ struct CompletionOptions {
147
149
  data["top_p"] = top_p;
148
150
  data["max_tokens"] = n_predict;
149
151
  data["stream"] = stream;
152
+ data["presence_penalty"] = presence_penalty;
150
153
 
151
154
  if (seed >= 0) {
152
155
  data["seed"] = seed;
@@ -7,6 +7,7 @@
7
7
  #include <chrono>
8
8
  #include <string>
9
9
  #include <vector>
10
+ #include <map>
10
11
 
11
12
  struct common_chat_templates;
12
13
 
@@ -125,6 +126,7 @@ struct common_chat_templates_inputs {
125
126
  common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
126
127
  bool enable_thinking = true;
127
128
  std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
129
+ std::map<std::string, std::string> chat_template_kwargs;
128
130
  };
129
131
 
130
132
  struct common_chat_params {
@@ -8,6 +8,7 @@
8
8
  #include <string>
9
9
  #include <string_view>
10
10
  #include <vector>
11
+ #include <map>
11
12
  #include <sstream>
12
13
 
13
14
  #ifdef _WIN32
@@ -358,6 +359,7 @@ struct common_params {
358
359
  int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
359
360
  std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
360
361
  std::string embd_sep = "\n"; // separator of embeddings
362
+ std::string cls_sep = "\t"; // separator of classification sequences
361
363
 
362
364
  // server params
363
365
  int32_t port = 8080; // server listens on this network port
@@ -368,6 +370,7 @@ struct common_params {
368
370
 
369
371
  std::string hostname = "127.0.0.1";
370
372
  std::string public_path = ""; // NOLINT
373
+ std::string api_prefix = ""; // NOLINT
371
374
  std::string chat_template = ""; // NOLINT
372
375
  bool use_jinja = false; // NOLINT
373
376
  bool enable_chat_template = true;
@@ -380,6 +383,8 @@ struct common_params {
380
383
  std::string ssl_file_key = ""; // NOLINT
381
384
  std::string ssl_file_cert = ""; // NOLINT
382
385
 
386
+ std::map<std::string, std::string> default_template_kwargs;
387
+
383
388
  // "advanced" endpoints are disabled by default for better security
384
389
  bool webui = true;
385
390
  bool endpoint_slots = false;
@@ -79,46 +79,6 @@ extern "C" {
79
79
  LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
80
80
  };
81
81
 
82
- // pre-tokenization types
83
- enum llama_vocab_pre_type {
84
- LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
85
- LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
86
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
87
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
88
- LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
89
- LLAMA_VOCAB_PRE_TYPE_MPT = 5,
90
- LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
91
- LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
92
- LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
93
- LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
94
- LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
95
- LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
96
- LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
97
- LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
98
- LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
99
- LLAMA_VOCAB_PRE_TYPE_PORO = 15,
100
- LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
101
- LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
102
- LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
103
- LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
104
- LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
105
- LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
106
- LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
107
- LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
108
- LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
109
- LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
110
- LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
111
- LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
112
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
113
- LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
114
- LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
115
- LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
116
- LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
117
- LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
118
- LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
119
- LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
120
- };
121
-
122
82
  enum llama_rope_type {
123
83
  LLAMA_ROPE_TYPE_NONE = -1,
124
84
  LLAMA_ROPE_TYPE_NORM = 0,
@@ -390,6 +350,7 @@ extern "C" {
390
350
  void * imatrix; // pointer to importance matrix data
391
351
  void * kv_overrides; // pointer to vector containing overrides
392
352
  void * tensor_types; // pointer to vector containing tensor types
353
+ void * prune_layers; // pointer to vector containing layer indices to prune
393
354
  } llama_model_quantize_params;
394
355
 
395
356
  typedef struct llama_logit_bias {
@@ -943,12 +904,14 @@ extern "C" {
943
904
  // Requires the context to have a memory.
944
905
  // For encode-decoder contexts, processes the batch using the decoder.
945
906
  // Positive return values do not mean a fatal error, but rather a warning.
946
- // Upon non-zero return values, the memory state is restored to the state before this call
907
+ // Upon fatal-error or abort, the ubatches that managed to be processed will remain in the memory state of the context
908
+ // To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
909
+ // Upon other return values, the memory state is restored to the state before this call
947
910
  // 0 - success
948
911
  // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
949
- // 2 - aborted
912
+ // 2 - aborted (processed ubatches will remain in the context's memory)
950
913
  // -1 - invalid input batch
951
- // < -1 - error
914
+ // < -1 - fatal error (processed ubatches will remain in the context's memory)
952
915
  LLAMA_API int32_t llama_decode(
953
916
  struct llama_context * ctx,
954
917
  struct llama_batch batch);
@@ -1044,6 +1007,7 @@ extern "C" {
1044
1007
 
1045
1008
  LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
1046
1009
  LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
1010
+ LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
1047
1011
 
1048
1012
  LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
1049
1013
  LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
@@ -1087,6 +1051,7 @@ extern "C" {
1087
1051
  /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
1088
1052
  /// @return Returns the number of tokens on success, no more than n_tokens_max
1089
1053
  /// @return Returns a negative number on failure - the number of tokens that would have been returned
1054
+ /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
1090
1055
  /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
1091
1056
  /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
1092
1057
  /// as plaintext. Does not insert a leading space.