@novastera-oss/llamarn 0.2.6 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (253)
  1. package/android/src/main/cpp/include/llama.h +141 -38
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +58 -24
  11. package/cpp/LlamaCppModel.h +3 -3
  12. package/cpp/PureCppImpl.cpp +1 -1
  13. package/cpp/PureCppImpl.h +2 -2
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  16. package/cpp/llama.cpp/Makefile +2 -2
  17. package/cpp/llama.cpp/README.md +32 -13
  18. package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
  19. package/cpp/llama.cpp/common/arg.cpp +37 -6
  20. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  21. package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
  22. package/cpp/llama.cpp/common/chat-parser.h +2 -0
  23. package/cpp/llama.cpp/common/chat.cpp +12 -9
  24. package/cpp/llama.cpp/common/chat.h +1 -1
  25. package/cpp/llama.cpp/common/common.cpp +53 -40
  26. package/cpp/llama.cpp/common/common.h +6 -2
  27. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  28. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  29. package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
  30. package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
  31. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  32. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  33. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  34. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
  35. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  38. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  88. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  90. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  91. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
  93. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
  94. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  95. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
  97. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  105. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  110. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  111. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
  112. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
  113. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  115. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  116. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  117. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  118. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
  119. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  131. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  132. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  133. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
  134. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
  135. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  136. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  137. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  138. package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
  139. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  140. package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
  141. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
  142. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
  143. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  144. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  145. package/cpp/llama.cpp/include/llama.h +141 -38
  146. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  147. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  148. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  149. package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
  150. package/cpp/llama.cpp/src/llama-arch.h +25 -1
  151. package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
  152. package/cpp/llama.cpp/src/llama-batch.h +110 -57
  153. package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
  154. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  155. package/cpp/llama.cpp/src/llama-context.cpp +360 -266
  156. package/cpp/llama.cpp/src/llama-context.h +27 -23
  157. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  158. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  159. package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
  160. package/cpp/llama.cpp/src/llama-graph.h +126 -58
  161. package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
  162. package/cpp/llama.cpp/src/llama-hparams.h +16 -2
  163. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
  164. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
  165. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
  166. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
  167. package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
  168. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
  169. package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
  170. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
  171. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
  172. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  173. package/cpp/llama.cpp/src/llama-memory.h +73 -36
  174. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  175. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  176. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  177. package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
  178. package/cpp/llama.cpp/src/llama-model.h +26 -0
  179. package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
  180. package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
  181. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  182. package/cpp/llama.cpp/src/llama.cpp +11 -7
  183. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  184. package/cpp/rn-completion.cpp +2 -2
  185. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  186. package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
  187. package/ios/include/chat.h +1 -1
  188. package/ios/include/common.h +6 -2
  189. package/ios/include/llama.h +141 -38
  190. package/ios/libs/llama.xcframework/Info.plist +15 -15
  191. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  192. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  193. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  194. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  195. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
  196. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  197. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  198. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  199. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  200. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  201. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  202. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  203. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  204. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  205. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  206. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
  207. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  208. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  209. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
  210. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  211. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  212. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
  213. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  219. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  220. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  221. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  222. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  223. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
  224. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  225. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  226. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  227. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  228. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  229. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  231. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  232. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  233. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
  234. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  235. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  236. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
  237. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  238. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  239. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
  240. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
  241. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  242. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  243. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  244. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  245. package/package.json +1 -2
  246. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  247. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  248. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  249. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
  250. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
  251. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  252. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  253. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
@@ -310,6 +310,8 @@ class ModelBase:
310
310
  gguf.MODEL_TENSOR.POSNET_NORM2,
311
311
  gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
312
312
  gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
313
+ gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF,
314
+ gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
313
315
  )
314
316
  )
315
317
  or not new_name.endswith(".weight")
@@ -320,7 +322,11 @@ class ModelBase:
320
322
  self.match_model_tensor_name(new_name, key, bid)
321
323
  for key in (
322
324
  gguf.MODEL_TENSOR.TOKEN_EMBD,
325
+ gguf.MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
323
326
  gguf.MODEL_TENSOR.OUTPUT,
327
+ gguf.MODEL_TENSOR.ALTUP_ROUTER,
328
+ gguf.MODEL_TENSOR.LAUREL_L,
329
+ gguf.MODEL_TENSOR.LAUREL_R,
324
330
  )
325
331
  ):
326
332
  if self.ftype in (
@@ -519,7 +525,7 @@ class TextModel(ModelBase):
519
525
  def set_gguf_parameters(self):
520
526
  self.gguf_writer.add_block_count(self.block_count)
521
527
 
522
- if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions"], optional=True)) is not None:
528
+ if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None:
523
529
  self.gguf_writer.add_context_length(n_ctx)
524
530
  logger.info(f"gguf: context length = {n_ctx}")
525
531
 
@@ -921,13 +927,20 @@ class TextModel(ModelBase):
921
927
  tokenizer = SentencePieceProcessor()
922
928
  tokenizer.LoadFromFile(str(tokenizer_path))
923
929
 
924
- vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
930
+ vocab_size = self.find_hparam([
931
+ "vocab_size_per_layer_input", # gemma3n
932
+ "vocab_size",
933
+ ], optional=True) or tokenizer.vocab_size()
925
934
 
926
935
  tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
927
936
  scores: list[float] = [-10000.0] * vocab_size
928
937
  toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
929
938
 
930
939
  for token_id in range(tokenizer.vocab_size()):
940
+ if token_id >= vocab_size:
941
+ logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}')
942
+ break
943
+
931
944
  piece = tokenizer.IdToPiece(token_id)
932
945
  text = piece.encode("utf-8")
933
946
  score = tokenizer.GetScore(token_id)
@@ -1898,9 +1911,7 @@ class LlamaModel(TextModel):
1898
1911
  hparams = self.hparams
1899
1912
  self.gguf_writer.add_vocab_size(hparams["vocab_size"])
1900
1913
 
1901
- if "head_dim" in hparams:
1902
- rope_dim = hparams["head_dim"]
1903
- else:
1914
+ if (rope_dim := hparams.get("head_dim")) is None:
1904
1915
  rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
1905
1916
  self.gguf_writer.add_rope_dimension_count(rope_dim)
1906
1917
 
@@ -1982,7 +1993,8 @@ class LlamaModel(TextModel):
1982
1993
  if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
1983
1994
  if rope_scaling.get("rope_type", '').lower() == "llama3":
1984
1995
  base = self.hparams.get("rope_theta", 10000.0)
1985
- dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
1996
+ if (dim := self.hparams.get("head_dim")) is None:
1997
+ dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
1986
1998
  freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
1987
1999
 
1988
2000
  factor = rope_scaling.get("factor", 8.0)
@@ -2017,6 +2029,20 @@ class LlamaModel(TextModel):
2017
2029
  raise ValueError(f"Unprocessed experts: {experts}")
2018
2030
 
2019
2031
 
2032
+ @ModelBase.register("ArceeForCausalLM")
2033
+ class ArceeModel(LlamaModel):
2034
+ model_arch = gguf.MODEL_ARCH.ARCEE
2035
+
2036
+ def set_gguf_parameters(self):
2037
+ super().set_gguf_parameters()
2038
+ self._try_set_pooling_type()
2039
+ rope_scaling = self.hparams.get("rope_scaling") or {}
2040
+ if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
2041
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
2042
+ self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
2043
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
2044
+
2045
+
2020
2046
  @ModelBase.register(
2021
2047
  "LlavaForConditionalGeneration", # pixtral
2022
2048
  "Mistral3ForConditionalGeneration", # mistral small 3.1
@@ -2132,7 +2158,6 @@ class Llama4Model(LlamaModel):
2132
2158
 
2133
2159
  def set_vocab(self):
2134
2160
  self._set_vocab_gpt2()
2135
- self.gguf_writer.add_add_bos_token(True)
2136
2161
 
2137
2162
  def set_gguf_parameters(self):
2138
2163
  super().set_gguf_parameters()
@@ -2181,7 +2206,7 @@ class Llama4VisionModel(MmprojModel):
2181
2206
  name += ".weight"
2182
2207
  if "multi_modal_projector.linear_1" in name:
2183
2208
  # despite the name with number postfix, this is a single fully connected layer
2184
- return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
2209
+ return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch)]
2185
2210
  return [(self.map_tensor_name(name), data_torch)]
2186
2211
  return []
2187
2212
 
@@ -2304,9 +2329,7 @@ class DeciModel(TextModel):
2304
2329
  hparams = self.hparams
2305
2330
  self.gguf_writer.add_vocab_size(hparams["vocab_size"])
2306
2331
 
2307
- if "head_dim" in hparams:
2308
- rope_dim = hparams["head_dim"]
2309
- else:
2332
+ if (rope_dim := hparams.get("head_dim")) is None:
2310
2333
  rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
2311
2334
  self.gguf_writer.add_rope_dimension_count(rope_dim)
2312
2335
 
@@ -2346,7 +2369,8 @@ class DeciModel(TextModel):
2346
2369
  if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
2347
2370
  if rope_scaling.get("rope_type", '').lower() == "llama3":
2348
2371
  base = self.hparams.get("rope_theta", 10000.0)
2349
- dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
2372
+ if (dim := self.hparams.get("head_dim")) is None:
2373
+ dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
2350
2374
  freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
2351
2375
 
2352
2376
  factor = rope_scaling.get("factor", 8.0)
@@ -3664,9 +3688,7 @@ class InternLM3Model(TextModel):
3664
3688
  hparams = self.hparams
3665
3689
  self.gguf_writer.add_vocab_size(hparams["vocab_size"])
3666
3690
 
3667
- if "head_dim" in hparams:
3668
- rope_dim = hparams["head_dim"]
3669
- else:
3691
+ if (rope_dim := hparams.get("head_dim")) is None:
3670
3692
  rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
3671
3693
  self.gguf_writer.add_rope_dimension_count(rope_dim)
3672
3694
 
@@ -3709,8 +3731,7 @@ class BertModel(TextModel):
3709
3731
  self._try_set_pooling_type()
3710
3732
 
3711
3733
  if self.cls_out_labels:
3712
- key_name = gguf.Keys.Classifier.OUTPUT_LABELS.format(arch = gguf.MODEL_ARCH_NAMES[self.model_arch])
3713
- self.gguf_writer.add_array(key_name, [v for k, v in sorted(self.cls_out_labels.items())])
3734
+ self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())])
3714
3735
 
3715
3736
  def set_vocab(self):
3716
3737
  tokens, toktypes, tokpre = self.get_vocab_base()
@@ -3909,9 +3930,6 @@ class BertModel(TextModel):
3909
3930
  special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
3910
3931
  special_vocab.add_to_gguf(self.gguf_writer)
3911
3932
 
3912
- self.gguf_writer.add_add_bos_token(True)
3913
- self.gguf_writer.add_add_eos_token(True)
3914
-
3915
3933
 
3916
3934
  @ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
3917
3935
  class DistilBertModel(BertModel):
@@ -3953,8 +3971,6 @@ class RobertaModel(BertModel):
3953
3971
  bpe_tok_path = self.dir_model / "tokenizer.json"
3954
3972
  if bpe_tok_path.exists():
3955
3973
  self._set_vocab_gpt2()
3956
- self.gguf_writer.add_add_bos_token(True)
3957
- self.gguf_writer.add_add_eos_token(True)
3958
3974
 
3959
3975
  # we need this to validate the size of the token_type embeddings
3960
3976
  # though currently we are passing all zeros to the token_type embeddings
@@ -4060,6 +4076,34 @@ class NomicBertModel(BertModel):
4060
4076
  raise ValueError(f"unknown tokenizer: {toktyp}")
4061
4077
 
4062
4078
 
4079
+ @ModelBase.register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification")
4080
+ class NeoBert(BertModel):
4081
+ model_arch = gguf.MODEL_ARCH.NEO_BERT
4082
+
4083
+ def set_gguf_parameters(self):
4084
+ super().set_gguf_parameters()
4085
+
4086
+ # NeoBERT uses 2/3 of the intermediate size as feed forward length
4087
+ self.gguf_writer.add_feed_forward_length(int(2 * self.hparams["intermediate_size"] / 3))
4088
+ self.gguf_writer.add_rope_freq_base(10000.0) # default value for NeoBERT
4089
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
4090
+
4091
+ f_rms_eps = self.hparams.get("norm_eps", 1e-6) # default value for NeoBERT
4092
+ self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
4093
+ logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
4094
+
4095
+ self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS) # https://huggingface.co/chandar-lab/NeoBERT#how-to-use
4096
+
4097
+ def modify_tensors(self, data_torch, name, bid):
4098
+ if name.startswith("decoder."):
4099
+ return []
4100
+
4101
+ if name.startswith("model."):
4102
+ name = name[6:]
4103
+
4104
+ return super().modify_tensors(data_torch, name, bid)
4105
+
4106
+
4063
4107
  @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
4064
4108
  class XLMRobertaModel(BertModel):
4065
4109
  model_arch = gguf.MODEL_ARCH.BERT
@@ -4186,6 +4230,7 @@ class Gemma2Model(TextModel):
4186
4230
  @ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
4187
4231
  class Gemma3Model(TextModel):
4188
4232
  model_arch = gguf.MODEL_ARCH.GEMMA3
4233
+ norm_shift = 1.0 # Gemma3RMSNorm adds 1.0 to the norm value
4189
4234
 
4190
4235
  def set_vocab(self):
4191
4236
  self._set_vocab_sentencepiece()
@@ -4207,9 +4252,8 @@ class Gemma3Model(TextModel):
4207
4252
  self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
4208
4253
  self.gguf_writer.add_file_type(self.ftype)
4209
4254
  self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
4210
- # both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3
4255
+ # attn_logit_softcapping is removed in Gemma3
4211
4256
  assert hparams.get("attn_logit_softcapping") is None
4212
- assert hparams.get("final_logit_softcapping") is None
4213
4257
  self.gguf_writer.add_sliding_window(hparams["sliding_window"])
4214
4258
  self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
4215
4259
  if hparams.get("rope_scaling") is not None:
@@ -4221,7 +4265,7 @@ class Gemma3Model(TextModel):
4221
4265
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4222
4266
  del bid # unused
4223
4267
 
4224
- if name.startswith("language_model."):
4268
+ if "language_model." in name:
4225
4269
  name = name.replace("language_model.", "")
4226
4270
 
4227
4271
  elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
@@ -4236,8 +4280,9 @@ class Gemma3Model(TextModel):
4236
4280
 
4237
4281
  # ref code in Gemma3RMSNorm
4238
4282
  # output = output * (1.0 + self.weight.float())
4283
+ # note: this is not the case on gemma3n
4239
4284
  if name.endswith("norm.weight"):
4240
- data_torch = data_torch + 1
4285
+ data_torch = data_torch + self.norm_shift
4241
4286
 
4242
4287
  return [(self.map_tensor_name(name), data_torch)]
4243
4288
 
@@ -4294,6 +4339,104 @@ class Gemma3VisionModel(MmprojModel):
4294
4339
  return [] # skip other tensors
4295
4340
 
4296
4341
 
4342
+ @ModelBase.register("Gemma3nForConditionalGeneration")
4343
+ class Gemma3NModel(Gemma3Model):
4344
+ model_arch = gguf.MODEL_ARCH.GEMMA3N
4345
+ norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code
4346
+
4347
+ _altup_proj: list[Tensor] = []
4348
+ _altup_unembd: list[Tensor] = []
4349
+
4350
+ def __init__(self, *args, **kwargs):
4351
+ super().__init__(*args, **kwargs)
4352
+ assert self.hparams["altup_num_inputs"] == 4, "Current conversion only supports 4 altup inputs"
4353
+ self._altup_proj = [
4354
+ torch.Tensor(), # to be replaced
4355
+ torch.Tensor(), # to be replaced
4356
+ torch.Tensor(), # to be replaced
4357
+ ]
4358
+ self._altup_unembd = [
4359
+ torch.Tensor(), # to be replaced
4360
+ torch.Tensor(), # to be replaced
4361
+ torch.Tensor(), # to be replaced
4362
+ ]
4363
+
4364
+ def set_vocab(self):
4365
+ with open(self.dir_model / "chat_template.jinja") as f:
4366
+ # quick hack to make sure chat template is added
4367
+ self.gguf_writer.add_chat_template(f.read())
4368
+ super().set_vocab()
4369
+
4370
+ def set_gguf_parameters(self):
4371
+ super().set_gguf_parameters()
4372
+ self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"])
4373
+ self.gguf_writer.add_altup_num_inputs(self.hparams["altup_num_inputs"])
4374
+ self.gguf_writer.add_embedding_length_per_layer_input(self.hparams["hidden_size_per_layer_input"])
4375
+ self.gguf_writer.add_shared_kv_layers(self.hparams["num_kv_shared_layers"])
4376
+
4377
+ activation_sparsity_scale = []
4378
+ for s in self.hparams["activation_sparsity_pattern"]:
4379
+ normal_dist = torch.distributions.normal.Normal(0, 1)
4380
+ std_multiplier = normal_dist.icdf(torch.tensor(s, dtype=torch.float32))
4381
+ activation_sparsity_scale.append(std_multiplier.item())
4382
+ self.gguf_writer.add_activation_sparsity_scale(activation_sparsity_scale)
4383
+
4384
+ sliding_window_pattern = []
4385
+ for t in self.hparams["layer_types"]:
4386
+ sliding_window_pattern.append(t == "sliding_attention")
4387
+ self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
4388
+
4389
+ def _stack_matrices(self, matrices: list[Tensor]) -> Tensor | None:
4390
+ has_all = all(m.numel() > 0 for m in matrices)
4391
+ if not has_all:
4392
+ return None
4393
+ else:
4394
+ return torch.stack(matrices, dim=0)
4395
+
4396
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4397
+ if name.endswith("_scale"):
4398
+ name = name + ".weight"
4399
+
4400
+ # TODO: implement self.prediction_coefs.weight.clamp_(...)
4401
+
4402
+ if "language_model." not in name:
4403
+ return [] # skip non-language model tensors
4404
+
4405
+ if "altup_unembed_projections" in name:
4406
+ data_torch = data_torch.to(device="cpu")
4407
+ if ".0." in name:
4408
+ self._altup_unembd[0] = data_torch
4409
+ elif ".1." in name:
4410
+ self._altup_unembd[1] = data_torch
4411
+ elif ".2." in name:
4412
+ self._altup_unembd[2] = data_torch
4413
+ else:
4414
+ raise ValueError(f"Unknown name: {name}")
4415
+ out = self._stack_matrices(self._altup_unembd)
4416
+ if out is not None:
4417
+ return [(self.map_tensor_name("model.altup_unembed_projections.weight"), out)]
4418
+ else:
4419
+ return []
4420
+
4421
+ if "altup_projections" in name:
4422
+ data_torch = data_torch.to(device="cpu")
4423
+ if ".0." in name:
4424
+ self._altup_proj[0] = data_torch
4425
+ elif ".1." in name:
4426
+ self._altup_proj[1] = data_torch
4427
+ elif ".2." in name:
4428
+ self._altup_proj[2] = data_torch
4429
+ else:
4430
+ raise ValueError(f"Unknown name: {name}")
4431
+ out = self._stack_matrices(self._altup_proj)
4432
+ if out is not None:
4433
+ return [(self.map_tensor_name("model.altup_projections.weight"), out)]
4434
+ else:
4435
+ return []
4436
+
4437
+ return super().modify_tensors(data_torch, name, bid)
4438
+
4439
+
4297
4440
  @ModelBase.register("Starcoder2ForCausalLM")
4298
4441
  class StarCoder2Model(TextModel):
4299
4442
  model_arch = gguf.MODEL_ARCH.STARCODER2
@@ -4799,25 +4942,6 @@ class OlmoeModel(TextModel):
4799
4942
  class JinaBertV2Model(BertModel):
4800
4943
  model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
4801
4944
 
4802
- def __init__(self, *args, **kwargs):
4803
- super().__init__(*args, **kwargs)
4804
- self.intermediate_size = self.hparams["intermediate_size"]
4805
-
4806
- def get_tensors(self):
4807
- for name, data in super().get_tensors():
4808
- if 'gated_layer' in name:
4809
- d1 = data[:self.intermediate_size, :]
4810
- name1 = name.replace('gated_layers', 'gated_layers_w')
4811
- name1 = name1.replace('up_gated_layer', 'gated_layers_v')
4812
- d2 = data[self.intermediate_size:, :]
4813
- name2 = name.replace('gated_layers', 'gated_layers_v')
4814
- name2 = name2.replace('up_gated_layer', 'gated_layers_w')
4815
- yield name1, d1
4816
- yield name2, d2
4817
- continue
4818
-
4819
- yield name, data
4820
-
4821
4945
  def set_vocab(self):
4822
4946
  tokenizer_class = 'BertTokenizer'
4823
4947
  with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
@@ -4830,16 +4954,6 @@ class JinaBertV2Model(BertModel):
4830
4954
  self.gguf_writer.add_token_type_count(2)
4831
4955
  else:
4832
4956
  raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
4833
- self.gguf_writer.add_add_bos_token(True)
4834
- self.gguf_writer.add_add_eos_token(True)
4835
-
4836
- def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4837
- # if name starts with "bert.", remove the prefix
4838
- # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
4839
- if name.startswith("bert."):
4840
- name = name[5:]
4841
-
4842
- return super().modify_tensors(data_torch, name, bid)
4843
4957
 
4844
4958
 
4845
4959
  @ModelBase.register("OpenELMForCausalLM")
@@ -5081,9 +5195,7 @@ class DeepseekModel(TextModel):
5081
5195
  def set_gguf_parameters(self):
5082
5196
  super().set_gguf_parameters()
5083
5197
  hparams = self.hparams
5084
- if "head_dim" in hparams:
5085
- rope_dim = hparams["head_dim"]
5086
- else:
5198
+ if (rope_dim := hparams.get("head_dim")) is None:
5087
5199
  rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
5088
5200
 
5089
5201
  self.gguf_writer.add_rope_dimension_count(rope_dim)
@@ -5287,6 +5399,34 @@ class DeepseekV2Model(TextModel):
5287
5399
  raise ValueError(f"Unprocessed experts: {experts}")
5288
5400
 
5289
5401
 
5402
+ @ModelBase.register("Dots1ForCausalLM")
5403
+ class Dots1Model(Qwen2MoeModel):
5404
+ model_arch = gguf.MODEL_ARCH.DOTS1
5405
+
5406
+ def __init__(self, *args, **kwargs):
5407
+ super().__init__(*args, **kwargs)
5408
+ self.hparams["num_experts"] = self.hparams["n_routed_experts"]
5409
+
5410
+ def set_gguf_parameters(self):
5411
+ super().set_gguf_parameters()
5412
+ self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
5413
+ self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
5414
+ self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
5415
+ self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
5416
+
5417
+ if self.hparams["scoring_func"] == "noaux_tc":
5418
+ self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
5419
+ else:
5420
+ raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
5421
+
5422
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
5423
+ if name.endswith("e_score_correction_bias"):
5424
+ name = name.replace("e_score_correction_bias", "e_score_correction.bias")
5425
+ if "shared_experts" in name:
5426
+ return [(self.map_tensor_name(name), data_torch)]
5427
+ return super().modify_tensors(data_torch, name, bid)
5428
+
5429
+
5290
5430
  @ModelBase.register("PLMForCausalLM")
5291
5431
  class PLMModel(TextModel):
5292
5432
  model_arch = gguf.MODEL_ARCH.PLM
@@ -5415,9 +5555,6 @@ class T5Model(TextModel):
5415
5555
  special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
5416
5556
  special_vocab.add_to_gguf(self.gguf_writer)
5417
5557
 
5418
- self.gguf_writer.add_add_bos_token(False)
5419
- self.gguf_writer.add_add_eos_token(True)
5420
-
5421
5558
  def set_gguf_parameters(self):
5422
5559
  if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
5423
5560
  logger.warning("Couldn't find context length in config.json, assuming default value of 512")
@@ -5555,9 +5692,6 @@ class T5EncoderModel(TextModel):
5555
5692
  special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
5556
5693
  special_vocab.add_to_gguf(self.gguf_writer)
5557
5694
 
5558
- self.gguf_writer.add_add_bos_token(False)
5559
- self.gguf_writer.add_add_eos_token(True)
5560
-
5561
5695
  def set_gguf_parameters(self):
5562
5696
  if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
5563
5697
  logger.warning("Couldn't find context length in config.json, assuming default value of 512")
@@ -5945,7 +6079,8 @@ class ExaoneModel(TextModel):
5945
6079
  if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
5946
6080
  if rope_scaling.get("rope_type", '').lower() == "llama3":
5947
6081
  base = self.hparams.get("rope_theta", 10000.0)
5948
- dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
6082
+ if (dim := self.hparams.get("head_dim")) is None:
6083
+ dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
5949
6084
  freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
5950
6085
 
5951
6086
  factor = rope_scaling.get("factor", 8.0)
@@ -6057,7 +6192,8 @@ class BailingMoeModel(TextModel):
6057
6192
  def set_gguf_parameters(self):
6058
6193
  super().set_gguf_parameters()
6059
6194
  hparams = self.hparams
6060
- rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
6195
+ if (rope_dim := hparams.get("head_dim")) is None:
6196
+ rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
6061
6197
 
6062
6198
  self.gguf_writer.add_rope_dimension_count(rope_dim)
6063
6199
  rope_scaling = self.hparams.get("rope_scaling") or {}
@@ -6089,7 +6225,8 @@ class BailingMoeModel(TextModel):
6089
6225
  n_head = self.hparams["num_attention_heads"]
6090
6226
  n_kv_head = self.hparams.get("num_key_value_heads")
6091
6227
  n_embd = self.hparams["hidden_size"]
6092
- head_dim = self.hparams.get("head_dim") or n_embd // n_head
6228
+ if (head_dim := self.hparams.get("head_dim")) is None:
6229
+ head_dim = n_embd // n_head
6093
6230
 
6094
6231
  output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
6095
6232
 
@@ -6350,8 +6487,8 @@ def parse_args() -> argparse.Namespace:
6350
6487
  help="model is executed on big endian machine",
6351
6488
  )
6352
6489
  parser.add_argument(
6353
- "model", type=Path,
6354
- help="directory containing model file",
6490
+ "model", type=str,
6491
+ help="directory containing model file or huggingface repository ID (if --remote)",
6355
6492
  nargs="?",
6356
6493
  )
6357
6494
  parser.add_argument(
@@ -6454,18 +6591,20 @@ def main() -> None:
6454
6591
  else:
6455
6592
  logging.basicConfig(level=logging.INFO)
6456
6593
 
6457
- dir_model = args.model
6458
-
6459
6594
  if args.remote:
6595
+ hf_repo_id = args.model
6460
6596
  from huggingface_hub import snapshot_download
6461
6597
  local_dir = snapshot_download(
6462
- repo_id=str(dir_model),
6598
+ repo_id=hf_repo_id,
6463
6599
  allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"])
6464
6600
  dir_model = Path(local_dir)
6465
6601
  logger.info(f"Downloaded config and tokenizer to {local_dir}")
6602
+ else:
6603
+ hf_repo_id = None
6604
+ dir_model = Path(args.model)
6466
6605
 
6467
6606
  if not dir_model.is_dir():
6468
- logger.error(f'Error: {args.model} is not a directory')
6607
+ logger.error(f'Error: {dir_model} is not a directory')
6469
6608
  sys.exit(1)
6470
6609
 
6471
6610
  ftype_map: dict[str, gguf.LlamaFileType] = {
@@ -6485,9 +6624,9 @@ def main() -> None:
6485
6624
 
6486
6625
  if args.outfile is not None:
6487
6626
  fname_out = args.outfile
6488
- elif args.remote:
6627
+ elif hf_repo_id:
6489
6628
  # if remote, use the model ID as the output file name
6490
- fname_out = Path("./" + str(args.model).replace("/", "-") + "-{ftype}.gguf")
6629
+ fname_out = Path("./" + hf_repo_id.replace("/", "-") + "-{ftype}.gguf")
6491
6630
  else:
6492
6631
  fname_out = dir_model
6493
6632
 
@@ -6516,7 +6655,7 @@ def main() -> None:
6516
6655
  split_max_tensors=args.split_max_tensors,
6517
6656
  split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
6518
6657
  small_first_shard=args.no_tensor_first_split,
6519
- remote_hf_model_id=str(args.model) if args.remote else None)
6658
+ remote_hf_model_id=hf_repo_id)
6520
6659
 
6521
6660
  if args.vocab_only:
6522
6661
  logger.info("Exporting model vocab...")
@@ -105,7 +105,7 @@ message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
105
105
  message(DEBUG "INS_ENB : ${INS_ENB}")
106
106
 
107
107
  option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
108
- option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
108
+ option(GGML_CPU_REPACK "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
109
109
  option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
110
110
  option(GGML_SSE42 "ggml: enable SSE 4.2" ${INS_ENB})
111
111
  option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
@@ -131,13 +131,14 @@ option(GGML_RVV "ggml: enable rvv" ON)
131
131
  option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
132
132
  option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
133
133
  option(GGML_VXE "ggml: enable vxe" ON)
134
+ option(GGML_NNPA "ggml: enable nnpa" ON)
134
135
 
135
136
  option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
136
137
  set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
137
138
  set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
138
139
 
139
140
 
140
- if (WIN32)
141
+ if (MINGW)
141
142
  set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
142
143
  endif()
143
144
 
@@ -172,6 +173,7 @@ option(GGML_HIP "ggml: use HIP"
172
173
  option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
173
174
  option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
174
175
  option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
176
+ option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
175
177
  option(GGML_VULKAN "ggml: use Vulkan" OFF)
176
178
  option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
177
179
  option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
@@ -367,6 +369,8 @@ if (MSVC)
367
369
  /wd4005 # Macro redefinition
368
370
  /wd4244 # Conversion from one type to another type, possible loss of data
369
371
  /wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data
372
+ /wd4305 # Conversion from 'type1' to 'type2', possible loss of data
373
+ /wd4566 # Conversion from 'char' to 'wchar_t', possible loss of data
370
374
  /wd4996 # Disable POSIX deprecation warnings
371
375
  /wd4702 # Unreachable code warnings
372
376
  )
@@ -386,4 +390,46 @@ if (MSVC)
386
390
  disable_msvc_warnings(ggml-cpu-skylakex)
387
391
  disable_msvc_warnings(ggml-cpu-icelake)
388
392
  disable_msvc_warnings(ggml-cpu-alderlake)
393
+
394
+ if (GGML_BUILD_EXAMPLES)
395
+ disable_msvc_warnings(common-ggml)
396
+ disable_msvc_warnings(common)
397
+
398
+ disable_msvc_warnings(mnist-common)
399
+ disable_msvc_warnings(mnist-eval)
400
+ disable_msvc_warnings(mnist-train)
401
+
402
+ disable_msvc_warnings(gpt-2-ctx)
403
+ disable_msvc_warnings(gpt-2-alloc)
404
+ disable_msvc_warnings(gpt-2-backend)
405
+ disable_msvc_warnings(gpt-2-sched)
406
+ disable_msvc_warnings(gpt-2-quantize)
407
+ disable_msvc_warnings(gpt-2-batched)
408
+
409
+ disable_msvc_warnings(gpt-j)
410
+ disable_msvc_warnings(gpt-j-quantize)
411
+
412
+ disable_msvc_warnings(magika)
413
+ disable_msvc_warnings(yolov3-tiny)
414
+ disable_msvc_warnings(sam)
415
+
416
+ disable_msvc_warnings(simple-ctx)
417
+ disable_msvc_warnings(simple-backend)
418
+ endif()
419
+
420
+ if (GGML_BUILD_TESTS)
421
+ disable_msvc_warnings(test-mul-mat)
422
+ disable_msvc_warnings(test-arange)
423
+ disable_msvc_warnings(test-backend-ops)
424
+ disable_msvc_warnings(test-cont)
425
+ disable_msvc_warnings(test-conv-transpose)
426
+ disable_msvc_warnings(test-conv-transpose-1d)
427
+ disable_msvc_warnings(test-conv1d)
428
+ disable_msvc_warnings(test-conv2d)
429
+ disable_msvc_warnings(test-conv2d-dw)
430
+ disable_msvc_warnings(test-customop)
431
+ disable_msvc_warnings(test-dup)
432
+ disable_msvc_warnings(test-opt)
433
+ disable_msvc_warnings(test-pool)
434
+ endif ()
389
435
  endif()
@@ -36,8 +36,7 @@ function(ggml_get_system_arch)
36
36
  (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
37
37
  CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
38
38
  set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE)
39
- elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR
40
- "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
39
+ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc|power")
41
40
  set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE)
42
41
  elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
43
42
  set(GGML_SYSTEM_ARCH "loongarch64" PARENT_SCOPE)
@@ -101,6 +101,7 @@ extern "C" {
101
101
  GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
102
102
  GGML_BACKEND_API int ggml_cpu_has_vsx (void);
103
103
  GGML_BACKEND_API int ggml_cpu_has_vxe (void);
104
+ GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
104
105
  GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
105
106
  GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
106
107
 
@@ -133,6 +134,7 @@ extern "C" {
133
134
 
134
135
  GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
135
136
 
137
+ GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
136
138
  GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
137
139
  GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
138
140
  GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);