@novastera-oss/llamarn 0.2.6 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (253)
  1. package/android/src/main/cpp/include/llama.h +141 -38
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +58 -24
  11. package/cpp/LlamaCppModel.h +3 -3
  12. package/cpp/PureCppImpl.cpp +1 -1
  13. package/cpp/PureCppImpl.h +2 -2
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  16. package/cpp/llama.cpp/Makefile +2 -2
  17. package/cpp/llama.cpp/README.md +32 -13
  18. package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
  19. package/cpp/llama.cpp/common/arg.cpp +37 -6
  20. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  21. package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
  22. package/cpp/llama.cpp/common/chat-parser.h +2 -0
  23. package/cpp/llama.cpp/common/chat.cpp +12 -9
  24. package/cpp/llama.cpp/common/chat.h +1 -1
  25. package/cpp/llama.cpp/common/common.cpp +53 -40
  26. package/cpp/llama.cpp/common/common.h +6 -2
  27. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  28. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  29. package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
  30. package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
  31. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  32. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  33. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  34. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
  35. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  38. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  88. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  90. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  91. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
  93. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
  94. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  95. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
  97. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  105. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  110. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  111. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
  112. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
  113. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  115. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  116. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  117. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  118. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
  119. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  131. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  132. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  133. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
  134. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
  135. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  136. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  137. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  138. package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
  139. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  140. package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
  141. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
  142. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
  143. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  144. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  145. package/cpp/llama.cpp/include/llama.h +141 -38
  146. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  147. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  148. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  149. package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
  150. package/cpp/llama.cpp/src/llama-arch.h +25 -1
  151. package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
  152. package/cpp/llama.cpp/src/llama-batch.h +110 -57
  153. package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
  154. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  155. package/cpp/llama.cpp/src/llama-context.cpp +360 -266
  156. package/cpp/llama.cpp/src/llama-context.h +27 -23
  157. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  158. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  159. package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
  160. package/cpp/llama.cpp/src/llama-graph.h +126 -58
  161. package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
  162. package/cpp/llama.cpp/src/llama-hparams.h +16 -2
  163. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
  164. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
  165. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
  166. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
  167. package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
  168. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
  169. package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
  170. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
  171. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
  172. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  173. package/cpp/llama.cpp/src/llama-memory.h +73 -36
  174. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  175. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  176. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  177. package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
  178. package/cpp/llama.cpp/src/llama-model.h +26 -0
  179. package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
  180. package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
  181. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  182. package/cpp/llama.cpp/src/llama.cpp +11 -7
  183. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  184. package/cpp/rn-completion.cpp +2 -2
  185. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  186. package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
  187. package/ios/include/chat.h +1 -1
  188. package/ios/include/common.h +6 -2
  189. package/ios/include/llama.h +141 -38
  190. package/ios/libs/llama.xcframework/Info.plist +15 -15
  191. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  192. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  193. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  194. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  195. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
  196. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  197. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  198. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  199. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  200. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  201. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  202. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  203. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  204. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  205. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  206. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
  207. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  208. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  209. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
  210. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  211. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  212. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
  213. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  219. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  220. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  221. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  222. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  223. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
  224. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  225. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  226. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  227. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  228. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  229. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  231. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  232. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  233. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
  234. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  235. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  236. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
  237. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  238. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  239. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
  240. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
  241. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  242. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  243. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  244. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  245. package/package.json +1 -2
  246. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  247. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  248. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  249. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
  250. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
  251. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  252. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  253. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
@@ -19,21 +19,13 @@ if (GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
     add_compile_definitions(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
     message(STATUS "Enabling bfloat16 glslc support")
 endif()
+if (GGML_VULKAN_SHADER_DEBUG_INFO)
+    add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
+    message(STATUS "Enabling shader debug info")
+endif()
 
 set(TARGET vulkan-shaders-gen)
 add_executable(${TARGET} vulkan-shaders-gen.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)
-
-# Configure output directories for MSVC builds
-if(MSVC)
-    # Get the main project's runtime output directory if possible
-    if(DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY)
-        foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES})
-            string(TOUPPER ${CONFIG} CONFIG)
-            set_target_properties(${TARGET} PROPERTIES
-                RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
-        endforeach()
-    endif()
-endif()
package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp (new file)
@@ -0,0 +1,98 @@
+#version 450
+
+#include "types.comp"
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};  // src0 - kernel: [K, Cout, Cin]
+layout (binding = 1) readonly buffer B {B_TYPE data_b[];};  // src1 - input:  [L, Cin]
+layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; // dst - result:  [KL, Cout]
+
+layout(local_size_x = 128 , local_size_y = 1, local_size_z = 1) in;
+
+layout (push_constant) uniform parameter {
+    uint32_t Cout;
+    uint32_t Cin;
+    uint32_t K;
+    uint32_t L;
+    uint32_t KL;
+
+    uint32_t nb01;
+    uint32_t nb02;
+    uint32_t nb11;
+    uint32_t nb1;
+
+    int32_t s0;
+} p;
+
+
+uint32_t Cout_idx = gl_WorkGroupID.x;
+const uint32_t bs = gl_WorkGroupSize.x;
+uint32_t tid = gl_LocalInvocationID.x;
+// Code is more straightforward if we assume it is bs*s0+K instead of (bs-1)*s0+K.
+uint32_t tmp_len = bs*p.s0+p.K;
+shared D_TYPE tmp[4096];
+
+uint splitWork(uint workSize){
+    return (bs + workSize -1) / bs;
+}
+
+void main(){
+    for(uint32_t i = 0; i < splitWork(tmp_len); i++){
+        uint32_t idx = i*bs+tid;
+        if(idx < tmp_len){
+            tmp[idx] = 0.0;
+        }
+    }
+
+    uint32_t L_blocks = splitWork(p.L);
+    for(uint32_t L_block_id = 0; L_block_id < L_blocks; L_block_id++){
+        if(L_block_id > 0){
+            barrier();
+            // Shift values in tmp to the current processing window
+            for(int i = 0; i < splitWork(tmp_len); i++){
+                uint32_t idx = i*bs+tid;
+                if(idx >= bs*p.s0 && idx < tmp_len){
+                    tmp[idx-bs*p.s0] = tmp[idx];
+                    tmp[idx] = 0.0;
+                }else if(idx >= p.K && idx < bs*p.s0){
+                    tmp[idx] = 0.0;
+                }
+            }
+        }
+        barrier();
+
+        // Save contributions of the block to tmp
+        uint32_t L_idx = L_block_id*bs + tid;
+        for(uint32_t K_idx = 0; K_idx < p.K; K_idx++){
+            D_TYPE dp = 0.0;
+            for(uint32_t Cin_idx = 0; Cin_idx < p.Cin; Cin_idx++){
+                A_TYPE elemKrn = data_a[K_idx + Cout_idx * p.nb01 + Cin_idx * p.nb02];
+                if(L_idx < p.L){
+                    B_TYPE elemInp = data_b[L_idx + Cin_idx*p.nb11];
+                    dp = fma(elemKrn, elemInp, dp);
+                }
+            }
+            tmp[tid*p.s0 + K_idx] += dp;
+            barrier();
+        }
+
+        // Save the computed values except the last block that can have different size
+        uint32_t KLb_idx = L_block_id*bs*p.s0;
+        if(L_block_id < L_blocks-1){
+            for(uint32_t s0_idx = 0; s0_idx < p.s0; s0_idx++){
+                uint32_t sh_idx = p.s0*tid+s0_idx;
+                uint32_t KL_idx = KLb_idx+sh_idx;
+                if(KL_idx < p.KL){
+                    data_d[KL_idx + Cout_idx*p.nb1] = tmp[sh_idx];
+                }
+            }
+        }
+    }
+
+    for(uint32_t i = 0; i < splitWork(tmp_len); i++){
+        uint32_t idx = i*bs+tid;
+        uint32_t KL_idx = (L_blocks-1)*bs*p.s0+idx;
+        if(KL_idx < p.KL){
+            data_d[KL_idx + Cout_idx*p.nb1] = tmp[idx];
+        }
+    }
+}
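Note: this new compute shader appears to back the existing ggml_conv_transpose_1d operator on the Vulkan backend; it is registered as conv_transpose_1d_f32 in the vulkan-shaders-gen hunk below. As a hedged sketch only (the helper name and shapes are invented for illustration), driving the operator through the public ggml graph API looks roughly like:

#include "ggml.h"

// Sketch (illustrative shapes): 1-D transposed convolution via the ggml API.
// kernel is [K, Cout, Cin] and input is [L, Cin], matching the shader comments.
struct ggml_tensor * build_conv_transpose_1d(struct ggml_context * ctx,
                                             struct ggml_tensor  * kernel,
                                             struct ggml_tensor  * input,
                                             int                   stride) {
    // s0 = stride, p0 = padding, d0 = dilation; with p0 == 0 and d0 == 1
    // the output length is (L - 1) * s0 + K.
    return ggml_conv_transpose_1d(ctx, kernel, input, /*s0*/ stride, /*p0*/ 0, /*d0*/ 1);
}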
package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -622,6 +622,8 @@ void process_shaders() {
 
     string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
 
+    string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+
     string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
 
     string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
package/cpp/llama.cpp/ggml/src/ggml.c
@@ -61,9 +61,6 @@
 #define m512i(p) (__m512i)(p)
 #endif
 
-// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
-float ggml_table_f32_f16[1 << 16];
-
 #if defined(__linux__) || \
     defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
     (defined(__APPLE__) && !TARGET_OS_TV && !TARGET_OS_WATCH)
@@ -888,12 +885,6 @@ struct ggml_context {
     struct ggml_object * objects_end;
 };
 
-struct ggml_context_container {
-    bool used;
-
-    struct ggml_context context;
-};
-
 //
 // data types
 //
@@ -942,6 +933,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "TRANSPOSE",
     "GET_ROWS",
     "GET_ROWS_BACK",
+    "SET_ROWS",
     "DIAG",
     "DIAG_MASK_INF",
     "DIAG_MASK_ZERO",
@@ -961,6 +953,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "UPSCALE",
     "PAD",
     "PAD_REFLECT_1D",
+    "ROLL",
    "ARANGE",
     "TIMESTEP_EMBEDDING",
     "ARGSORT",
@@ -991,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };
 
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1037,6 +1030,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "transpose(x)",
     "get_rows(x)",
     "get_rows_back(x)",
+    "set_rows(x)",
     "diag(x)",
     "diag_mask_inf(x)",
     "diag_mask_zero(x)",
@@ -1056,6 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "upscale(x)",
     "pad(x)",
     "pad_reflect_1d(x)",
+    "roll(x)",
     "arange(start, stop, step)",
     "timestep_embedding(timesteps, dim, max_period)",
     "argsort(x)",
@@ -1086,7 +1081,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };
 
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -1355,6 +1350,12 @@ bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) {
         tensor->nb[2] == ggml_type_size(tensor->type);
 }
 
+bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor) {
+    return
+        tensor->ne[0] == ggml_blck_size(tensor->type) ||
+        tensor->nb[0] == ggml_type_size(tensor->type);
+}
+
 static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
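A hedged reading of the new ggml_is_contiguous_rows predicate: it accepts tensors whose individual rows are dense even when the rows themselves are strided apart, which is weaker than full contiguity. A small illustrative sketch (the shapes and the helper are assumptions, not code from the package):

#include "ggml.h"

// Sketch: a view that keeps every other row of an [8, 4] F32 tensor.
// The row stride (nb[1]) doubles, but elements within a row stay dense.
static void contiguous_rows_example(struct ggml_context * ctx) {
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
    struct ggml_tensor * v = ggml_view_2d(ctx, a, 8, 2, 2*a->nb[1], 0);

    GGML_ASSERT( ggml_is_contiguous_rows(v)); // rows are dense
    GGML_ASSERT(!ggml_is_contiguous(v));      // but the tensor as a whole is not
}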
@@ -1426,14 +1427,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         // initialize time system (required on Windows)
         ggml_time_init();
 
-        for (int i = 0; i < (1 << 16); ++i) {
-            union {
-                uint16_t u16;
-                ggml_fp16_t fp16;
-            } u = {i};
-            ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
-        }
-
         is_first_call = false;
     }
 
@@ -3399,6 +3392,35 @@ struct ggml_tensor * ggml_get_rows_back(
     return result;
 }
 
+// ggml_set_rows
+
+struct ggml_tensor * ggml_set_rows(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * c) {
+    GGML_ASSERT(a->ne[0] == b->ne[0]);
+    GGML_ASSERT(a->ne[2] == b->ne[2]);
+    GGML_ASSERT(a->ne[3] == b->ne[3]);
+    GGML_ASSERT(b->ne[1] == c->ne[0]);
+    GGML_ASSERT(b->ne[2] % c->ne[1] == 0);
+    GGML_ASSERT(b->ne[3] % c->ne[2] == 0);
+    GGML_ASSERT(c->ne[3] == 1);
+    GGML_ASSERT(b->type == GGML_TYPE_F32);
+    GGML_ASSERT(c->type == GGML_TYPE_I64);
+
+    GGML_ASSERT(ggml_is_contiguous_rows(a));
+    GGML_ASSERT(ggml_is_contiguous_rows(b));
+
+    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+
+    result->op     = GGML_OP_SET_ROWS;
+    result->src[0] = b;
+    result->src[1] = c;
+
+    return result;
+}
+
 // ggml_diag
 
 struct ggml_tensor * ggml_diag(
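The new ggml_set_rows operator scatters the F32 rows of b into a at the I64 row indices held in c, returning a view of a. A minimal usage sketch under assumed shapes (the helper below is illustrative, not part of the package):

#include "ggml.h"

// Sketch: scatter 4 rows of new data into a 16-row destination tensor.
// Assumes a ggml_context * ctx has already been initialized.
static struct ggml_tensor * scatter_rows_example(struct ggml_context * ctx) {
    struct ggml_tensor * dst  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 16); // a
    struct ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 4);  // b
    struct ggml_tensor * idx  = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, 4);      // c

    // dst[idx[i], :] = rows[i, :] for each i; the result is a view of dst.
    return ggml_set_rows(ctx, dst, rows, idx);
}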
@@ -4347,6 +4369,34 @@ struct ggml_tensor * ggml_pad_reflect_1d(
     return result;
 }
 
+// ggml_roll
+
+struct ggml_tensor * ggml_roll(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   shift0,
+        int                   shift1,
+        int                   shift2,
+        int                   shift3) {
+    GGML_ASSERT(a->nb[0] == ggml_type_size(a->type));
+    GGML_ASSERT(abs(shift0) < a->ne[0]);
+    GGML_ASSERT(abs(shift1) < a->ne[1]);
+    GGML_ASSERT(abs(shift2) < a->ne[2]);
+    GGML_ASSERT(abs(shift3) < a->ne[3]);
+
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params_i32(result, 0, shift0);
+    ggml_set_op_params_i32(result, 1, shift1);
+    ggml_set_op_params_i32(result, 2, shift2);
+    ggml_set_op_params_i32(result, 3, shift3);
+
+    result->op     = GGML_OP_ROLL;
+    result->src[0] = a;
+
+    return result;
+}
+
 // ggml_arange
 
 struct ggml_tensor * ggml_arange(
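The new ggml_roll operator duplicates a and records per-dimension cyclic shift offsets as op params, by analogy with np.roll / torch.roll. Another illustrative sketch under assumed shapes:

#include "ggml.h"

// Sketch: roll an [8, 4] F32 tensor by +1 along dim 0 and -2 along dim 1.
// By analogy with np.roll, a positive shift moves elements toward higher indices.
static struct ggml_tensor * roll_example(struct ggml_context * ctx) {
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
    return ggml_roll(ctx, a, /*shift0*/ 1, /*shift1*/ -2, /*shift2*/ 0, /*shift3*/ 0);
}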
package/cpp/llama.cpp/ggml/src/gguf.cpp
@@ -335,7 +335,11 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
 
     for (uint32_t i = 0; i < magic.size(); i++) {
         if (magic[i] != GGUF_MAGIC[i]) {
-            GGML_LOG_ERROR("%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
+            char c0 = isprint(magic[0]) ? magic[0] : '?';
+            char c1 = isprint(magic[1]) ? magic[1] : '?';
+            char c2 = isprint(magic[2]) ? magic[2] : '?';
+            char c3 = isprint(magic[3]) ? magic[3] : '?';
+            GGML_LOG_ERROR("%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, c0, c1, c2, c3);
             gguf_free(ctx);
             return nullptr;
         }
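The change above only sanitizes the error log: non-printable magic bytes are printed as '?' instead of raw garbage. Caller-visible behavior is unchanged; a hedged sketch of the usual open-and-check pattern through the public gguf_init_from_file wrapper (assuming the gguf.h header):

#include "gguf.h"
#include <stdio.h>

// Sketch: open a file as GGUF; on a bad magic the library logs a sanitized
// message and returns NULL.
static int check_gguf(const char * path) {
    struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ NULL };
    struct gguf_context * gctx = gguf_init_from_file(path, params);
    if (gctx == NULL) {
        fprintf(stderr, "not a valid GGUF file: %s\n", path);
        return 1;
    }
    gguf_free(gctx);
    return 0;
}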
package/cpp/llama.cpp/gguf-py/gguf/constants.py
@@ -118,6 +118,10 @@ class Keys:
         EMBEDDING_SCALE = "{arch}.embedding_scale"
         TOKEN_SHIFT_COUNT = "{arch}.token_shift_count"
         INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step"
+        ACTIVATION_SPARSITY_SCALE = "{arch}.activation_sparsity_scale"
+        ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx"
+        ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs"
+        EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input"
 
     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"
@@ -142,6 +146,8 @@ class Keys:
         SCALE = "{arch}.attention.scale"
         KEY_LENGTH_MLA = "{arch}.attention.key_length_mla"
         VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla"
+        SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers"
+        SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
 
     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"
@@ -198,6 +204,7 @@ class Keys:
         MASK_ID = "tokenizer.ggml.mask_token_id"
         ADD_BOS = "tokenizer.ggml.add_bos_token"
         ADD_EOS = "tokenizer.ggml.add_eos_token"
+        ADD_SEP = "tokenizer.ggml.add_sep_token"
         ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
         REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
         PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
@@ -291,6 +298,7 @@ class MODEL_ARCH(IntEnum):
     BERT = auto()
     NOMIC_BERT = auto()
     NOMIC_BERT_MOE = auto()
+    NEO_BERT = auto()
     JINA_BERT_V2 = auto()
     BLOOM = auto()
     STABLELM = auto()
@@ -312,6 +320,7 @@ class MODEL_ARCH(IntEnum):
     GEMMA = auto()
     GEMMA2 = auto()
     GEMMA3 = auto()
+    GEMMA3N = auto()
     STARCODER2 = auto()
     RWKV6 = auto()
     RWKV6QWEN2 = auto()
@@ -343,6 +352,8 @@ class MODEL_ARCH(IntEnum):
     WAVTOKENIZER_DEC = auto()
     PLM = auto()
     BAILINGMOE = auto()
+    DOTS1 = auto()
+    ARCEE = auto()
 
 
 class VISION_PROJECTOR_TYPE(IntEnum):
@@ -395,6 +406,22 @@ class MODEL_TENSOR(IntEnum):
     ATTN_Q_NORM = auto()
     ATTN_K_NORM = auto()
     LAYER_OUT_NORM = auto()
+    PER_LAYER_TOKEN_EMBD = auto()  # gemma3n
+    PER_LAYER_MODEL_PROJ = auto()  # gemma3n
+    PER_LAYER_INP_GATE = auto()    # gemma3n
+    PER_LAYER_PROJ = auto()        # gemma3n
+    PER_LAYER_PROJ_NORM = auto()   # gemma3n
+    PER_LAYER_POST_NORM = auto()   # gemma3n
+    ALTUP_PROJ = auto()            # gemma3n
+    ALTUP_UNEMBD_PROJ = auto()     # gemma3n
+    ALTUP_CORRECT_COEF = auto()    # gemma3n
+    ALTUP_CORRECT_SCALE = auto()   # gemma3n
+    ALTUP_PREDICT_COEF = auto()    # gemma3n
+    ALTUP_ROUTER = auto()          # gemma3n
+    ALTUP_ROUTER_NORM = auto()     # gemma3n
+    LAUREL_L = auto()              # gemma3n
+    LAUREL_R = auto()              # gemma3n
+    LAUREL_POST_NORM = auto()      # gemma3n
     SSM_IN = auto()
     SSM_CONV1D = auto()
     SSM_X = auto()
@@ -571,6 +598,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.BERT: "bert",
     MODEL_ARCH.NOMIC_BERT: "nomic-bert",
     MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe",
+    MODEL_ARCH.NEO_BERT: "neo-bert",
     MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
     MODEL_ARCH.BLOOM: "bloom",
     MODEL_ARCH.STABLELM: "stablelm",
@@ -592,6 +620,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.GEMMA: "gemma",
     MODEL_ARCH.GEMMA2: "gemma2",
     MODEL_ARCH.GEMMA3: "gemma3",
+    MODEL_ARCH.GEMMA3N: "gemma3n",
     MODEL_ARCH.STARCODER2: "starcoder2",
     MODEL_ARCH.RWKV6: "rwkv6",
     MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2",
@@ -623,6 +652,8 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
     MODEL_ARCH.PLM: "plm",
     MODEL_ARCH.BAILINGMOE: "bailingmoe",
+    MODEL_ARCH.DOTS1: "dots1",
+    MODEL_ARCH.ARCEE: "arcee",
 }
 
 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -675,6 +706,22 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
     MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b",
     MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
+    MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd",  # gemma3n
+    MODEL_TENSOR.PER_LAYER_MODEL_PROJ: "per_layer_model_proj",  # gemma3n
+    MODEL_TENSOR.PER_LAYER_PROJ_NORM: "per_layer_proj_norm",  # gemma3n
+    MODEL_TENSOR.ALTUP_UNEMBD_PROJ: "altup_unembd_proj",  # gemma3n
+    MODEL_TENSOR.ALTUP_PROJ: "altup_proj",  # gemma3n
+    MODEL_TENSOR.PER_LAYER_INP_GATE: "blk.{bid}.inp_gate",  # gemma3n
+    MODEL_TENSOR.PER_LAYER_PROJ: "blk.{bid}.proj",  # gemma3n
+    MODEL_TENSOR.PER_LAYER_POST_NORM: "blk.{bid}.post_norm",  # gemma3n
+    MODEL_TENSOR.ALTUP_CORRECT_COEF: "blk.{bid}.altup_correct_coef",  # gemma3n
+    MODEL_TENSOR.ALTUP_CORRECT_SCALE: "blk.{bid}.altup_correct_scale",  # gemma3n
+    MODEL_TENSOR.ALTUP_PREDICT_COEF: "blk.{bid}.altup_predict_coef",  # gemma3n
+    MODEL_TENSOR.ALTUP_ROUTER: "blk.{bid}.altup_router",  # gemma3n
+    MODEL_TENSOR.ALTUP_ROUTER_NORM: "blk.{bid}.altup_router_norm",  # gemma3n
+    MODEL_TENSOR.LAUREL_L: "blk.{bid}.laurel_l",  # gemma3n
+    MODEL_TENSOR.LAUREL_R: "blk.{bid}.laurel_r",  # gemma3n
+    MODEL_TENSOR.LAUREL_POST_NORM: "blk.{bid}.laurel_post_norm",  # gemma3n
     MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
     MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
     MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
@@ -1077,6 +1124,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_UP_EXP,
         MODEL_TENSOR.LAYER_OUT_NORM,
     ],
+    MODEL_ARCH.NEO_BERT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ENC_OUTPUT_NORM,
+        MODEL_TENSOR.CLS,
+        MODEL_TENSOR.CLS_OUT,
+    ],
     MODEL_ARCH.JINA_BERT_V2: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.TOKEN_EMBD_NORM,
@@ -1467,6 +1526,41 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_PRE_NORM,
         MODEL_TENSOR.FFN_POST_NORM,
     ],
+    MODEL_ARCH.GEMMA3N: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_POST_NORM,
+        MODEL_TENSOR.FFN_PRE_NORM,
+        MODEL_TENSOR.FFN_POST_NORM,
+        # altup / laurel
+        MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
+        MODEL_TENSOR.PER_LAYER_MODEL_PROJ,
+        MODEL_TENSOR.PER_LAYER_INP_GATE,
+        MODEL_TENSOR.PER_LAYER_PROJ,
+        MODEL_TENSOR.PER_LAYER_PROJ_NORM,
+        MODEL_TENSOR.PER_LAYER_POST_NORM,
+        MODEL_TENSOR.ALTUP_PROJ,
+        MODEL_TENSOR.ALTUP_UNEMBD_PROJ,
+        MODEL_TENSOR.ALTUP_CORRECT_COEF,
+        MODEL_TENSOR.ALTUP_CORRECT_SCALE,
+        MODEL_TENSOR.ALTUP_PREDICT_COEF,
+        MODEL_TENSOR.ALTUP_ROUTER,
+        MODEL_TENSOR.ALTUP_ROUTER_NORM,
+        MODEL_TENSOR.LAUREL_L,
+        MODEL_TENSOR.LAUREL_R,
+        MODEL_TENSOR.LAUREL_POST_NORM,
+    ],
     MODEL_ARCH.STARCODER2: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -2044,6 +2138,45 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
     ],
+    MODEL_ARCH.DOTS1: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+    ],
+    MODEL_ARCH.ARCEE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     # TODO
 }
 
package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py
@@ -271,7 +271,7 @@ class GGUFWriter:
 
     def add_key_value(self, key: str, val: Any, vtype: GGUFValueType, sub_type: GGUFValueType | None = None) -> None:
         if any(key in kv_data for kv_data in self.kv_data):
-            raise ValueError(f'Duplicated key name {key!r}')
+            logger.warning(f'Duplicated key name {key!r}, overwriting it with new value {val!r} of type {vtype.name}')
 
         self.kv_data[0][key] = GGUFValue(value=val, type=vtype, sub_type=sub_type)
 
@@ -672,6 +672,18 @@ class GGUFWriter:
     def add_decoder_start_token_id(self, id: int) -> None:
         self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id)
 
+    def add_embedding_length_per_layer_input(self, value: int) -> None:
+        self.add_uint32(Keys.LLM.EMBD_LENGTH_PER_LAYER_INP.format(arch=self.arch), value)
+
+    def add_altup_active_idx(self, val: int) -> None:
+        self.add_uint32(Keys.LLM.ALTUP_ACTIVE_IDX.format(arch=self.arch), val)
+
+    def add_altup_num_inputs(self, val: int) -> None:
+        self.add_uint32(Keys.LLM.ALTUP_NUM_INPUTS.format(arch=self.arch), val)
+
+    def add_activation_sparsity_scale(self, values: Sequence[float]) -> None:
+        self.add_array(Keys.LLM.ACTIVATION_SPARSITY_SCALE.format(arch=self.arch), values)
+
     def add_head_count(self, count: int | Sequence[int]) -> None:
         if isinstance(count, int):
             self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)
@@ -702,6 +714,12 @@ class GGUFWriter:
     def add_clamp_kqv(self, value: float) -> None:
         self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)
 
+    def add_shared_kv_layers(self, value: float) -> None:
+        self.add_float32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value)
+
+    def add_sliding_window_pattern(self, value: Sequence[bool]) -> None:
+        self.add_array(Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch), value)
+
     def add_logit_scale(self, value: float) -> None:
         self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value)
 
@@ -891,6 +909,9 @@ class GGUFWriter:
     def add_add_eos_token(self, value: bool) -> None:
         self.add_bool(Keys.Tokenizer.ADD_EOS, value)
 
+    def add_add_sep_token(self, value: bool) -> None:
+        self.add_bool(Keys.Tokenizer.ADD_SEP, value)
+
     def add_add_space_prefix(self, value: bool) -> None:
         self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)
 
@@ -935,6 +956,9 @@ class GGUFWriter:
     def add_eom_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.EOM_ID, id)
 
+    def add_classifier_output_labels(self, labels: Sequence[str]) -> None:
+        self.add_array(Keys.Classifier.OUTPUT_LABELS.format(arch=self.arch), labels)
+
     # for vision models
 
     def add_clip_has_vision_encoder(self, value: bool) -> None: