@novastera-oss/llamarn 0.2.9 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247)
  1. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  5. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  6. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  7. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  9. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  10. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  11. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  12. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  14. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  15. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  17. package/cpp/build-info.cpp +2 -2
  18. package/cpp/llama.cpp/CMakeLists.txt +0 -1
  19. package/cpp/llama.cpp/README.md +4 -5
  20. package/cpp/llama.cpp/build-xcframework.sh +1 -1
  21. package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
  22. package/cpp/llama.cpp/common/arg.cpp +17 -0
  23. package/cpp/llama.cpp/common/chat.cpp +37 -20
  24. package/cpp/llama.cpp/common/chat.h +2 -0
  25. package/cpp/llama.cpp/common/common.h +4 -0
  26. package/cpp/llama.cpp/convert_hf_to_gguf.py +745 -6
  27. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
  28. package/cpp/llama.cpp/ggml/CMakeLists.txt +7 -2
  29. package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
  30. package/cpp/llama.cpp/ggml/include/ggml.h +173 -10
  31. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +0 -1
  32. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
  33. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
  34. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
  35. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +28 -1
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1203 -163
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +6 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +33 -9
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +142 -9
  43. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +17 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  47. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
  48. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
  49. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
  50. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
  51. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
  52. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
  53. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +8 -6
  54. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +185 -79
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
  66. package/cpp/llama.cpp/ggml/src/ggml-impl.h +64 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
  68. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +35 -9
  69. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +167 -39
  70. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +254 -57
  71. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +505 -40
  73. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  74. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  77. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  83. package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
  84. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +693 -1034
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +60 -9
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +15 -18
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +711 -292
  92. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
  93. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
  94. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
  95. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
  96. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
  97. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
  98. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
  105. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
  106. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  108. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  110. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  111. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  112. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  113. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
  114. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  115. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  116. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
  117. package/cpp/llama.cpp/ggml/src/ggml.c +382 -61
  118. package/cpp/llama.cpp/ggml/src/gguf.cpp +8 -1
  119. package/cpp/llama.cpp/gguf-py/gguf/constants.py +209 -0
  120. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +8 -2
  121. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +73 -21
  122. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +12 -3
  123. package/cpp/llama.cpp/include/llama.h +0 -40
  124. package/cpp/llama.cpp/src/llama-arch.cpp +210 -3
  125. package/cpp/llama.cpp/src/llama-arch.h +18 -1
  126. package/cpp/llama.cpp/src/llama-batch.cpp +27 -1
  127. package/cpp/llama.cpp/src/llama-batch.h +8 -1
  128. package/cpp/llama.cpp/src/llama-chat.cpp +15 -0
  129. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  130. package/cpp/llama.cpp/src/llama-graph.cpp +119 -184
  131. package/cpp/llama.cpp/src/llama-graph.h +47 -60
  132. package/cpp/llama.cpp/src/llama-hparams.cpp +7 -1
  133. package/cpp/llama.cpp/src/llama-hparams.h +3 -0
  134. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
  135. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
  136. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
  137. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +62 -24
  138. package/cpp/llama.cpp/src/llama-kv-cells.h +62 -10
  139. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
  140. package/cpp/llama.cpp/src/llama-memory-hybrid.h +3 -1
  141. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +20 -10
  142. package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
  143. package/cpp/llama.cpp/src/llama-memory.h +3 -0
  144. package/cpp/llama.cpp/src/llama-model.cpp +2530 -685
  145. package/cpp/llama.cpp/src/llama-model.h +18 -0
  146. package/cpp/llama.cpp/src/llama-quant.cpp +1 -0
  147. package/cpp/llama.cpp/src/llama-vocab.cpp +13 -2
  148. package/cpp/llama.cpp/src/llama-vocab.h +41 -0
  149. package/ios/include/chat.h +2 -0
  150. package/ios/include/common.h +4 -0
  151. package/ios/include/llama.h +0 -40
  152. package/ios/libs/llama.xcframework/Info.plist +19 -19
  153. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  154. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5055 -4886
  155. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  156. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +173 -10
  157. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +0 -40
  158. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  159. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  160. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
  161. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3764
  162. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  163. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  164. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
  165. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  166. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  167. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
  168. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3766
  169. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
  170. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +173 -10
  171. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -40
  172. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
  173. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +173 -10
  174. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -40
  175. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  176. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
  177. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +173 -10
  178. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -40
  179. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  180. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  181. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  182. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4890
  183. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  184. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +173 -10
  185. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -40
  186. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  187. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  188. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
  189. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3764
  190. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  191. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  192. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
  193. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  194. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  195. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5091 -4922
  196. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  197. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +173 -10
  198. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -40
  199. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  200. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  201. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4897
  202. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3794
  203. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  204. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  205. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
  206. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  207. package/package.json +1 -1
  208. package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
  209. package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  210. package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  211. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  212. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  213. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  214. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  215. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  216. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  217. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  218. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  219. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  220. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  221. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  222. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  223. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  224. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  225. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  226. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  227. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  228. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  229. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  230. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  231. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  232. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  233. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  234. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  235. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  236. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  237. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  238. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  239. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  240. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  241. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  242. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  243. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  244. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  245. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  246. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  247. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
@@ -202,19 +202,34 @@ void ggml_print_backtrace(void) {
202
202
  }
203
203
  #endif
204
204
 
205
+ static ggml_abort_callback_t g_abort_callback = NULL;
206
+
207
+ // Set the abort callback (passing null will restore original abort functionality: printing a message and backtrace to stderr)
208
+ GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback) {
209
+ ggml_abort_callback_t ret_val = g_abort_callback;
210
+ g_abort_callback = callback;
211
+ return ret_val;
212
+ }
213
+
205
214
  void ggml_abort(const char * file, int line, const char * fmt, ...) {
206
215
  fflush(stdout);
207
216
 
208
- fprintf(stderr, "%s:%d: ", file, line);
217
+ char message[2048];
218
+ int offset = snprintf(message, sizeof(message), "%s:%d: ", file, line);
209
219
 
210
220
  va_list args;
211
221
  va_start(args, fmt);
212
- vfprintf(stderr, fmt, args);
222
+ vsnprintf(message + offset, sizeof(message) - offset, fmt, args);
213
223
  va_end(args);
214
224
 
215
- fprintf(stderr, "\n");
225
+ if (g_abort_callback) {
226
+ g_abort_callback(message);
227
+ } else {
228
+ // default: print error and backtrace to stderr
229
+ fprintf(stderr, "%s\n", message);
230
+ ggml_print_backtrace();
231
+ }
216
232
 
217
- ggml_print_backtrace();
218
233
  abort();
219
234
  }
220
235
 
@@ -458,6 +473,14 @@ bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) {
458
473
  return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0;
459
474
  }
460
475
 
476
+ const char * ggml_version(void) {
477
+ return GGML_VERSION;
478
+ }
479
+
480
+ const char * ggml_commit(void) {
481
+ return GGML_COMMIT;
482
+ }
483
+
461
484
  //
462
485
  // timing
463
486
  //
@@ -945,6 +968,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
945
968
  "CONV_TRANSPOSE_1D",
946
969
  "IM2COL",
947
970
  "IM2COL_BACK",
971
+ "CONV_2D",
948
972
  "CONV_2D_DW",
949
973
  "CONV_TRANSPOSE_2D",
950
974
  "POOL_1D",
@@ -982,9 +1006,11 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
982
1006
  "CROSS_ENTROPY_LOSS",
983
1007
  "CROSS_ENTROPY_LOSS_BACK",
984
1008
  "OPT_STEP_ADAMW",
1009
+
1010
+ "GLU",
985
1011
  };
986
1012
 
987
- static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
1013
+ static_assert(GGML_OP_COUNT == 86, "GGML_OP_COUNT != 86");
988
1014
 
989
1015
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
990
1016
  "none",
@@ -1042,6 +1068,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1042
1068
  "conv_transpose_1d(x)",
1043
1069
  "im2col(x)",
1044
1070
  "im2col_back(x)",
1071
+ "conv_2d(x)",
1045
1072
  "conv_2d_dw(x)",
1046
1073
  "conv_transpose_2d(x)",
1047
1074
  "pool_1d(x)",
@@ -1079,9 +1106,11 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1079
1106
  "cross_entropy_loss(x,y)",
1080
1107
  "cross_entropy_loss_back(x,y)",
1081
1108
  "adamw(x)",
1109
+
1110
+ "glu(x)",
1082
1111
  };
1083
1112
 
1084
- static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
1113
+ static_assert(GGML_OP_COUNT == 86, "GGML_OP_COUNT != 86");
1085
1114
 
1086
1115
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1087
1116
 
@@ -1107,6 +1136,17 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
1107
1136
  static_assert(GGML_UNARY_OP_COUNT == 15, "GGML_UNARY_OP_COUNT != 15");
1108
1137
 
1109
1138
 
1139
+ static const char * GGML_GLU_OP_NAME[GGML_GLU_OP_COUNT] = {
1140
+ "REGLU",
1141
+ "GEGLU",
1142
+ "SWIGLU",
1143
+ "GEGLU_ERF",
1144
+ "GEGLU_QUICK",
1145
+ };
1146
+
1147
+ static_assert(GGML_GLU_OP_COUNT == 5, "GGML_GLU_OP_COUNT != 5");
1148
+
1149
+
1110
1150
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
1111
1151
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
1112
1152
 
@@ -1209,11 +1249,19 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) {
1209
1249
  return GGML_UNARY_OP_NAME[op];
1210
1250
  }
1211
1251
 
1252
+ const char * ggml_glu_op_name(enum ggml_glu_op op) {
1253
+ return GGML_GLU_OP_NAME[op];
1254
+ }
1255
+
1212
1256
  const char * ggml_op_desc(const struct ggml_tensor * t) {
1213
1257
  if (t->op == GGML_OP_UNARY) {
1214
1258
  enum ggml_unary_op uop = ggml_get_unary_op(t);
1215
1259
  return ggml_unary_op_name(uop);
1216
1260
  }
1261
+ if (t->op == GGML_OP_GLU) {
1262
+ enum ggml_glu_op gop = ggml_get_glu_op(t);
1263
+ return ggml_glu_op_name(gop);
1264
+ }
1217
1265
  return ggml_op_name(t->op);
1218
1266
  }
1219
1267
 
@@ -1730,6 +1778,11 @@ enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
1730
1778
  return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
1731
1779
  }
1732
1780
 
1781
+ enum ggml_glu_op ggml_get_glu_op(const struct ggml_tensor * tensor) {
1782
+ GGML_ASSERT(tensor->op == GGML_OP_GLU);
1783
+ return (enum ggml_glu_op) ggml_get_op_params_i32(tensor, 0);
1784
+ }
1785
+
1733
1786
  const char * ggml_get_name(const struct ggml_tensor * tensor) {
1734
1787
  return tensor->name;
1735
1788
  }
@@ -2609,6 +2662,156 @@ struct ggml_tensor * ggml_exp_inplace(
2609
2662
  return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_EXP);
2610
2663
  }
2611
2664
 
2665
+ // ggml_glu
2666
+
2667
+ static struct ggml_tensor * ggml_glu_impl(
2668
+ struct ggml_context * ctx,
2669
+ struct ggml_tensor * a,
2670
+ struct ggml_tensor * b,
2671
+ enum ggml_glu_op op,
2672
+ bool swapped) {
2673
+ GGML_ASSERT(ggml_is_contiguous_1(a));
2674
+
2675
+ if (b) {
2676
+ GGML_ASSERT(ggml_is_contiguous_1(b));
2677
+ GGML_ASSERT(ggml_are_same_shape(a, b));
2678
+ GGML_ASSERT(a->type == b->type);
2679
+ }
2680
+
2681
+ int64_t ne[GGML_MAX_DIMS] = { a->ne[0] / 2 }; for (int i = 1; i < GGML_MAX_DIMS; i++) ne[i] = a->ne[i];
2682
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b ? a->ne : ne, NULL, 0);
2683
+
2684
+ ggml_set_op_params_i32(result, 0, (int32_t) op);
2685
+ ggml_set_op_params_i32(result, 1, (int32_t) swapped);
2686
+
2687
+ result->op = GGML_OP_GLU;
2688
+ result->src[0] = a;
2689
+ result->src[1] = b;
2690
+
2691
+ return result;
2692
+ }
2693
+
2694
+ struct ggml_tensor * ggml_glu(
2695
+ struct ggml_context * ctx,
2696
+ struct ggml_tensor * a,
2697
+ enum ggml_glu_op op,
2698
+ bool swapped) {
2699
+ return ggml_glu_impl(ctx, a, NULL, op, swapped);
2700
+ }
2701
+
2702
+ struct ggml_tensor * ggml_glu_split(
2703
+ struct ggml_context * ctx,
2704
+ struct ggml_tensor * a,
2705
+ struct ggml_tensor * b,
2706
+ enum ggml_glu_op op) {
2707
+ return ggml_glu_impl(ctx, a, b, op, false);
2708
+ }
2709
+
2710
+ // ggml_reglu
2711
+
2712
+ struct ggml_tensor * ggml_reglu(
2713
+ struct ggml_context * ctx,
2714
+ struct ggml_tensor * a) {
2715
+ return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_REGLU, false);
2716
+ }
2717
+
2718
+ struct ggml_tensor * ggml_reglu_swapped(
2719
+ struct ggml_context * ctx,
2720
+ struct ggml_tensor * a) {
2721
+ return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_REGLU, true);
2722
+ }
2723
+
2724
+ struct ggml_tensor * ggml_reglu_split(
2725
+ struct ggml_context * ctx,
2726
+ struct ggml_tensor * a,
2727
+ struct ggml_tensor * b) {
2728
+ return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_REGLU, false);
2729
+ }
2730
+
2731
+ // ggml_geglu
2732
+
2733
+ struct ggml_tensor * ggml_geglu(
2734
+ struct ggml_context * ctx,
2735
+ struct ggml_tensor * a) {
2736
+ return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU, false);
2737
+ }
2738
+
2739
+ struct ggml_tensor * ggml_geglu_swapped(
2740
+ struct ggml_context * ctx,
2741
+ struct ggml_tensor * a) {
2742
+ return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU, true);
2743
+ }
2744
+
2745
+ struct ggml_tensor * ggml_geglu_split(
2746
+ struct ggml_context * ctx,
2747
+ struct ggml_tensor * a,
2748
+ struct ggml_tensor * b) {
2749
+ return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_GEGLU, false);
2750
+ }
2751
+
2752
+ // ggml_swiglu
2753
+
2754
+ struct ggml_tensor * ggml_swiglu(
2755
+ struct ggml_context * ctx,
2756
+ struct ggml_tensor * a) {
2757
+ return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_SWIGLU, false);
2758
+ }
2759
+
2760
+ struct ggml_tensor * ggml_swiglu_swapped(
2761
+ struct ggml_context * ctx,
2762
+ struct ggml_tensor * a) {
2763
+ return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_SWIGLU, true);
2764
+ }
2765
+
2766
+ struct ggml_tensor * ggml_swiglu_split(
2767
+ struct ggml_context * ctx,
2768
+ struct ggml_tensor * a,
2769
+ struct ggml_tensor * b) {
2770
+ return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_SWIGLU, false);
2771
+ }
2772
+
2773
+ // ggml_geglu_erf
2774
+
2775
+ struct ggml_tensor * ggml_geglu_erf(
2776
+ struct ggml_context * ctx,
2777
+ struct ggml_tensor * a) {
2778
+ return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU_ERF, false);
2779
+ }
2780
+
2781
+ struct ggml_tensor * ggml_geglu_erf_swapped(
2782
+ struct ggml_context * ctx,
2783
+ struct ggml_tensor * a) {
2784
+ return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU_ERF, true);
2785
+ }
2786
+
2787
+ struct ggml_tensor * ggml_geglu_erf_split(
2788
+ struct ggml_context * ctx,
2789
+ struct ggml_tensor * a,
2790
+ struct ggml_tensor * b) {
2791
+ return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_GEGLU_ERF, false);
2792
+ }
2793
+
2794
+ // ggml_geglu_quick
2795
+
2796
+ struct ggml_tensor * ggml_geglu_quick(
2797
+ struct ggml_context * ctx,
2798
+ struct ggml_tensor * a) {
2799
+ return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU_QUICK, false);
2800
+ }
2801
+
2802
+ struct ggml_tensor * ggml_geglu_quick_swapped(
2803
+ struct ggml_context * ctx,
2804
+ struct ggml_tensor * a) {
2805
+ return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU_QUICK, true);
2806
+ }
2807
+
2808
+ struct ggml_tensor * ggml_geglu_quick_split(
2809
+ struct ggml_context * ctx,
2810
+ struct ggml_tensor * a,
2811
+ struct ggml_tensor * b) {
2812
+ return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_GEGLU_QUICK, false);
2813
+ }
2814
+
2612
2815
  // ggml_norm
2613
2816
 
2614
2817
  static struct ggml_tensor * ggml_norm_impl(
@@ -2866,12 +3069,14 @@ static struct ggml_tensor * ggml_scale_impl(
2866
3069
  struct ggml_context * ctx,
2867
3070
  struct ggml_tensor * a,
2868
3071
  float s,
3072
+ float b,
2869
3073
  bool inplace) {
2870
3074
  GGML_ASSERT(ggml_is_padded_1d(a));
2871
3075
 
2872
3076
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
2873
3077
 
2874
- ggml_set_op_params(result, &s, sizeof(s));
3078
+ float params[2] = { s, b };
3079
+ ggml_set_op_params(result, &params, sizeof(params));
2875
3080
 
2876
3081
  result->op = GGML_OP_SCALE;
2877
3082
  result->src[0] = a;
@@ -2883,14 +3088,30 @@ struct ggml_tensor * ggml_scale(
2883
3088
  struct ggml_context * ctx,
2884
3089
  struct ggml_tensor * a,
2885
3090
  float s) {
2886
- return ggml_scale_impl(ctx, a, s, false);
3091
+ return ggml_scale_impl(ctx, a, s, 0.0, false);
2887
3092
  }
2888
3093
 
2889
3094
  struct ggml_tensor * ggml_scale_inplace(
2890
3095
  struct ggml_context * ctx,
2891
3096
  struct ggml_tensor * a,
2892
3097
  float s) {
2893
- return ggml_scale_impl(ctx, a, s, true);
3098
+ return ggml_scale_impl(ctx, a, s, 0.0, true);
3099
+ }
3100
+
3101
+ struct ggml_tensor * ggml_scale_bias(
3102
+ struct ggml_context * ctx,
3103
+ struct ggml_tensor * a,
3104
+ float s,
3105
+ float b) {
3106
+ return ggml_scale_impl(ctx, a, s, b, false);
3107
+ }
3108
+
3109
+ struct ggml_tensor * ggml_scale_bias_inplace(
3110
+ struct ggml_context * ctx,
3111
+ struct ggml_tensor * a,
3112
+ float s,
3113
+ float b) {
3114
+ return ggml_scale_impl(ctx, a, s, b, true);
2894
3115
  }
2895
3116
 
2896
3117
  // ggml_set
@@ -3515,9 +3736,10 @@ static struct ggml_tensor * ggml_soft_max_impl(
3515
3736
  if (mask) {
3516
3737
  GGML_ASSERT(mask->type == GGML_TYPE_F16 || mask->type == GGML_TYPE_F32);
3517
3738
  GGML_ASSERT(ggml_is_contiguous(mask));
3518
- GGML_ASSERT(ggml_is_matrix(mask));
3519
3739
  GGML_ASSERT(mask->ne[0] == a->ne[0]);
3520
3740
  GGML_ASSERT(mask->ne[1] >= a->ne[1]);
3741
+ GGML_ASSERT(a->ne[2]%mask->ne[2] == 0);
3742
+ GGML_ASSERT(a->ne[3]%mask->ne[3] == 0);
3521
3743
  }
3522
3744
 
3523
3745
  if (max_bias > 0.0f) {
@@ -4157,6 +4379,44 @@ struct ggml_tensor * ggml_conv_2d_dw_direct(
4157
4379
  return result;
4158
4380
  }
4159
4381
 
4382
+ // ggml_conv_2d_direct
4383
+
4384
+ struct ggml_tensor * ggml_conv_2d_direct(
4385
+ struct ggml_context * ctx,
4386
+ struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC]
4387
+ struct ggml_tensor * b, // input data [W, H, C, N]
4388
+ int s0, // stride dimension 0
4389
+ int s1, // stride dimension 1
4390
+ int p0, // padding dimension 0
4391
+ int p1, // padding dimension 1
4392
+ int d0, // dilation dimension 0
4393
+ int d1) {// dilation dimension 1
4394
+
4395
+ GGML_ASSERT(a->ne[2] == b->ne[2]);
4396
+ //GGML_ASSERT(a->type == b->type);
4397
+
4398
+ int64_t ne[4];
4399
+ ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
4400
+ ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
4401
+ ne[2] = a->ne[3];
4402
+ ne[3] = b->ne[3];
4403
+
4404
+ struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);
4405
+
4406
+ ggml_set_op_params_i32(result, 0, s0);
4407
+ ggml_set_op_params_i32(result, 1, s1);
4408
+ ggml_set_op_params_i32(result, 2, p0);
4409
+ ggml_set_op_params_i32(result, 3, p1);
4410
+ ggml_set_op_params_i32(result, 4, d0);
4411
+ ggml_set_op_params_i32(result, 5, d1);
4412
+
4413
+ result->op = GGML_OP_CONV_2D;
4414
+ result->src[0] = a;
4415
+ result->src[1] = b;
4416
+
4417
+ return result;
4418
+ }
4419
+
4160
4420
  // ggml_conv_transpose_2d_p0
4161
4421
 
4162
4422
  static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
@@ -4273,24 +4533,21 @@ struct ggml_tensor * ggml_pool_2d_back(
4273
4533
  return result;
4274
4534
  }
4275
4535
 
4276
- // ggml_upscale
4536
+ // ggml_upscale / ggml_interpolate
4277
4537
 
4278
- static struct ggml_tensor * ggml_upscale_impl(
4538
+ static struct ggml_tensor * ggml_interpolate_impl(
4279
4539
  struct ggml_context * ctx,
4280
4540
  struct ggml_tensor * a,
4281
- int ne0,
4282
- int ne1,
4283
- int ne2,
4284
- int ne3,
4285
- enum ggml_scale_mode mode) {
4286
- GGML_ASSERT(a->ne[0] <= ne0);
4287
- GGML_ASSERT(a->ne[1] <= ne1);
4288
- GGML_ASSERT(a->ne[2] <= ne2);
4289
- GGML_ASSERT(a->ne[3] <= ne3);
4541
+ int64_t ne0,
4542
+ int64_t ne1,
4543
+ int64_t ne2,
4544
+ int64_t ne3,
4545
+ uint32_t mode) {
4546
+ GGML_ASSERT((mode & 0xFF) < GGML_SCALE_MODE_COUNT);
4290
4547
 
4291
4548
  struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
4292
4549
 
4293
- ggml_set_op_params_i32(result, 0, mode);
4550
+ ggml_set_op_params_i32(result, 0, (int32_t)mode);
4294
4551
 
4295
4552
  result->op = GGML_OP_UPSCALE;
4296
4553
  result->src[0] = a;
@@ -4303,7 +4560,8 @@ struct ggml_tensor * ggml_upscale(
4303
4560
  struct ggml_tensor * a,
4304
4561
  int scale_factor,
4305
4562
  enum ggml_scale_mode mode) {
4306
- return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3], mode);
4563
+ GGML_ASSERT(scale_factor > 1);
4564
+ return ggml_interpolate_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3], mode);
4307
4565
  }
4308
4566
 
4309
4567
  struct ggml_tensor * ggml_upscale_ext(
@@ -4314,7 +4572,18 @@ struct ggml_tensor * ggml_upscale_ext(
4314
4572
  int ne2,
4315
4573
  int ne3,
4316
4574
  enum ggml_scale_mode mode) {
4317
- return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3, mode);
4575
+ return ggml_interpolate_impl(ctx, a, ne0, ne1, ne2, ne3, mode);
4576
+ }
4577
+
4578
+ struct ggml_tensor * ggml_interpolate(
4579
+ struct ggml_context * ctx,
4580
+ struct ggml_tensor * a,
4581
+ int64_t ne0,
4582
+ int64_t ne1,
4583
+ int64_t ne2,
4584
+ int64_t ne3,
4585
+ uint32_t mode) {
4586
+ return ggml_interpolate_impl(ctx, a, ne0, ne1, ne2, ne3, mode);
4318
4587
  }
4319
4588
 
4320
4589
  // ggml_pad
@@ -4491,13 +4760,17 @@ struct ggml_tensor * ggml_flash_attn_ext(
4491
4760
  GGML_ASSERT(ggml_can_mul_mat(k, q));
4492
4761
  // TODO: check if vT can be multiplied by (k*qT)
4493
4762
 
4763
+ GGML_ASSERT(q->ne[3] == k->ne[3]);
4764
+ GGML_ASSERT(q->ne[3] == v->ne[3]);
4765
+
4494
4766
  if (mask) {
4495
4767
  GGML_ASSERT(ggml_is_contiguous(mask));
4496
- GGML_ASSERT(mask->ne[2] == 1);
4497
- GGML_ASSERT(mask->ne[3] == 1);
4498
4768
  GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
4499
4769
  "the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
4500
4770
  //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
4771
+
4772
+ GGML_ASSERT(q->ne[2] % mask->ne[2] == 0);
4773
+ GGML_ASSERT(q->ne[3] % mask->ne[3] == 0);
4501
4774
  }
4502
4775
 
4503
4776
  if (max_bias > 0.0f) {
@@ -4625,7 +4898,6 @@ struct ggml_tensor * ggml_ssm_conv(
4625
4898
  const int64_t n_s = sx->ne[2];
4626
4899
 
4627
4900
  // TODO: maybe support other strides than 1?
4628
- // FIXME: this is always true?
4629
4901
  GGML_ASSERT(sx->ne[0] == d_conv - 1 + n_t);
4630
4902
  GGML_ASSERT(sx->ne[1] == d_inner);
4631
4903
  GGML_ASSERT(n_t >= 0);
@@ -4648,36 +4920,49 @@ struct ggml_tensor * ggml_ssm_scan(
4648
4920
  struct ggml_tensor * dt,
4649
4921
  struct ggml_tensor * A,
4650
4922
  struct ggml_tensor * B,
4651
- struct ggml_tensor * C) {
4923
+ struct ggml_tensor * C,
4924
+ struct ggml_tensor * ids) {
4652
4925
  GGML_ASSERT(ggml_is_contiguous(s));
4653
- GGML_ASSERT(ggml_is_contiguous(x));
4654
4926
  GGML_ASSERT(ggml_is_contiguous(dt));
4655
4927
  GGML_ASSERT(ggml_is_contiguous(A));
4656
- GGML_ASSERT(ggml_is_matrix(A));
4657
- GGML_ASSERT(ggml_is_3d(B));
4658
- GGML_ASSERT(ggml_is_3d(s));
4928
+ GGML_ASSERT(x->nb[0] == ggml_type_size(x->type));
4659
4929
  GGML_ASSERT(B->nb[0] == ggml_type_size(B->type));
4660
4930
  GGML_ASSERT(C->nb[0] == ggml_type_size(C->type));
4661
- GGML_ASSERT(ggml_are_same_shape(x, dt));
4931
+ GGML_ASSERT(x->nb[1] == x->ne[0]*x->nb[0]);
4932
+ GGML_ASSERT(B->nb[1] == B->ne[0]*B->nb[0]);
4933
+ GGML_ASSERT(C->nb[1] == C->ne[0]*C->nb[0]);
4662
4934
  GGML_ASSERT(ggml_are_same_shape(B, C));
4935
+ GGML_ASSERT(ids->type == GGML_TYPE_I32);
4663
4936
 
4664
4937
  {
4665
4938
  const int64_t d_state = s->ne[0];
4666
- const int64_t d_inner = s->ne[1];
4667
- const int64_t n_seq_tokens = x->ne[1];
4668
- const int64_t n_seqs = x->ne[2];
4669
-
4670
- GGML_ASSERT(s->ne[2] == n_seqs);
4671
- GGML_ASSERT(x->ne[0] == d_inner);
4672
- GGML_ASSERT(A->ne[0] == d_state);
4673
- GGML_ASSERT(A->ne[1] == d_inner);
4939
+ const int64_t head_dim = x->ne[0];
4940
+ const int64_t n_head = x->ne[1];
4941
+ const int64_t n_seq_tokens = x->ne[2];
4942
+ const int64_t n_seqs = x->ne[3];
4943
+
4944
+ GGML_ASSERT(dt->ne[0] == n_head);
4945
+ GGML_ASSERT(dt->ne[1] == n_seq_tokens);
4946
+ GGML_ASSERT(dt->ne[2] == n_seqs);
4947
+ GGML_ASSERT(ggml_is_3d(dt));
4948
+ GGML_ASSERT(s->ne[1] == head_dim);
4949
+ GGML_ASSERT(s->ne[2] == n_head);
4674
4950
  GGML_ASSERT(B->ne[0] == d_state);
4675
- GGML_ASSERT(B->ne[1] == n_seq_tokens);
4676
- GGML_ASSERT(B->ne[2] == n_seqs);
4951
+ GGML_ASSERT(B->ne[2] == n_seq_tokens);
4952
+ GGML_ASSERT(B->ne[3] == n_seqs);
4953
+ GGML_ASSERT(ids->ne[0] == n_seqs);
4954
+ GGML_ASSERT(ggml_is_vector(ids));
4955
+ GGML_ASSERT(A->ne[1] == n_head);
4956
+ GGML_ASSERT(ggml_is_matrix(A));
4957
+
4958
+ if (A->ne[0] != 1) {
4959
+ // Mamba-1 has more granular decay factors
4960
+ GGML_ASSERT(A->ne[0] == d_state);
4961
+ }
4677
4962
  }
4678
4963
 
4679
4964
  // concatenated y + ssm_states
4680
- struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + ggml_nelements(s));
4965
+ struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + s->ne[0]*s->ne[1]*s->ne[2]*ids->ne[0]);
4681
4966
 
4682
4967
  result->op = GGML_OP_SSM_SCAN;
4683
4968
  result->src[0] = s;
@@ -4686,6 +4971,7 @@ struct ggml_tensor * ggml_ssm_scan(
4686
4971
  result->src[3] = A;
4687
4972
  result->src[4] = B;
4688
4973
  result->src[5] = C;
4974
+ result->src[6] = ids;
4689
4975
 
4690
4976
  return result;
4691
4977
  }
@@ -5509,7 +5795,7 @@ static void ggml_compute_backward(
5509
5795
  } break;
5510
5796
  case GGML_OP_MEAN: {
5511
5797
  if (src0_needs_grads) {
5512
- ggml_add1_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false));
5798
+ ggml_add1_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], 0.0, false));
5513
5799
  }
5514
5800
  } break;
5515
5801
  case GGML_OP_REPEAT: {
@@ -5586,7 +5872,7 @@ static void ggml_compute_backward(
5586
5872
  if (src0_needs_grads) {
5587
5873
  float s;
5588
5874
  memcpy(&s, tensor->op_params, sizeof(float));
5589
- ggml_add_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, s, false));
5875
+ ggml_add_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, s, 0.0, false));
5590
5876
  }
5591
5877
  } break;
5592
5878
  case GGML_OP_SET: {
@@ -5826,13 +6112,28 @@ static void ggml_compute_backward(
5826
6112
  }
5827
6113
  GGML_ASSERT(!src1_needs_grads && "backward pass for labels not implemented");
5828
6114
  } break;
6115
+ case GGML_OP_GLU: {
6116
+ switch (ggml_get_glu_op(tensor)) {
6117
+ case GGML_GLU_OP_SWIGLU: {
6118
+ if (src0_needs_grads) {
6119
+ GGML_ASSERT(src1 && "backward pass only implemented for split swiglu");
6120
+ ggml_add_or_set(ctx, cgraph, isrc0, ggml_silu_back(ctx, ggml_mul(ctx, grad, src1), src0));
6121
+ }
6122
+ if (src1_needs_grads) {
6123
+ ggml_add_or_set(ctx, cgraph, isrc1, ggml_mul(ctx, ggml_silu(ctx, src0), grad));
6124
+ }
6125
+ } break;
6126
+ default: {
6127
+ GGML_ABORT("unsupported glu op for backward pass: %s", ggml_glu_op_name(ggml_get_glu_op(tensor)));
6128
+ } //break;
6129
+ }
6130
+ } break;
5829
6131
  case GGML_OP_NONE: {
5830
6132
  // noop
5831
6133
  } break;
5832
6134
  case GGML_OP_COUNT:
5833
6135
  default: {
5834
- fprintf(stderr, "%s: unsupported ggml op for backward pass: %s\n", __func__, ggml_op_name(tensor->op));
5835
- GGML_ABORT("fatal error");
6136
+ GGML_ABORT("%s: unsupported ggml op for backward pass: %s\n", __func__, ggml_op_name(tensor->op));
5836
6137
  } //break;
5837
6138
  }
5838
6139
 
@@ -5841,19 +6142,32 @@ static void ggml_compute_backward(
5841
6142
  GGML_ASSERT(!src2_needs_grads || ggml_are_same_shape(src2, cgraph->grads[isrc2]));
5842
6143
  }
5843
6144
 
5844
- static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
6145
+ static size_t ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
5845
6146
  // check if already visited
5846
- if (ggml_hash_insert(&cgraph->visited_hash_set, node) == GGML_HASHSET_ALREADY_EXISTS) {
5847
- return;
6147
+ size_t node_hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node);
6148
+ GGML_ASSERT(node_hash_pos != GGML_HASHSET_FULL);
6149
+ if (!ggml_bitset_get(cgraph->visited_hash_set.used, node_hash_pos)) {
6150
+ // This is the first time we see this node in the current graph.
6151
+ cgraph->visited_hash_set.keys[node_hash_pos] = node;
6152
+ ggml_bitset_set(cgraph->visited_hash_set.used, node_hash_pos);
6153
+ cgraph->use_counts[node_hash_pos] = 0;
6154
+ } else {
6155
+ // already visited
6156
+ return node_hash_pos;
5848
6157
  }
5849
6158
 
5850
6159
  for (int i = 0; i < GGML_MAX_SRC; ++i) {
5851
6160
  const int k =
5852
6161
  (cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i :
5853
6162
  (cgraph->order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? (GGML_MAX_SRC-1-i) :
5854
- /* unknown order, just fall back to using i*/ i;
5855
- if (node->src[k]) {
5856
- ggml_visit_parents(cgraph, node->src[k]);
6163
+ /* unknown order, just fall back to using i */ i;
6164
+
6165
+ struct ggml_tensor * src = node->src[k];
6166
+ if (src) {
6167
+ size_t src_hash_pos = ggml_visit_parents(cgraph, src);
6168
+
6169
+ // Update the use count for this operand.
6170
+ cgraph->use_counts[src_hash_pos]++;
5857
6171
  }
5858
6172
  }
5859
6173
 
@@ -5877,6 +6191,8 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
5877
6191
  cgraph->nodes[cgraph->n_nodes] = node;
5878
6192
  cgraph->n_nodes++;
5879
6193
  }
6194
+
6195
+ return node_hash_pos;
5880
6196
  }
5881
6197
 
5882
6198
  static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
@@ -6014,6 +6330,7 @@ static size_t ggml_graph_nbytes(size_t size, bool grads) {
6014
6330
  incr_ptr_aligned(&p, sizeof(struct ggml_cgraph), 1);
6015
6331
  incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // nodes
6016
6332
  incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // leafs
6333
+ incr_ptr_aligned(&p, hash_size * sizeof(int32_t), sizeof(int32_t)); // use_counts
6017
6334
  incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // hash keys
6018
6335
  if (grads) {
6019
6336
  incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grads
@@ -6043,11 +6360,12 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
6043
6360
 
6044
6361
  void * p = cgraph + 1;
6045
6362
 
6046
- struct ggml_tensor ** nodes_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
6047
- struct ggml_tensor ** leafs_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
6048
- struct ggml_tensor ** hash_keys_ptr = incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
6049
- struct ggml_tensor ** grads_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
6050
- struct ggml_tensor ** grad_accs_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
6363
+ struct ggml_tensor ** nodes_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
6364
+ struct ggml_tensor ** leafs_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
6365
+ int32_t * use_counts_ptr = incr_ptr_aligned(&p, hash_size * sizeof(int32_t), sizeof(int32_t));
6366
+ struct ggml_tensor ** hash_keys_ptr = incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
6367
+ struct ggml_tensor ** grads_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
6368
+ struct ggml_tensor ** grad_accs_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
6051
6369
 
6052
6370
  ggml_bitset_t * hash_used = incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t));
6053
6371
 
@@ -6062,6 +6380,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
6062
6380
  /*.grads =*/ grads_ptr,
6063
6381
  /*.grad_accs =*/ grad_accs_ptr,
6064
6382
  /*.leafs =*/ leafs_ptr,
6383
+ /*.use_counts =*/ use_counts_ptr,
6065
6384
  /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
6066
6385
  /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
6067
6386
  };
@@ -6088,7 +6407,8 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
6088
6407
  /*.grads =*/ NULL, // gradients would need visited_hash_set
6089
6408
  /*.grad_accs =*/ NULL,
6090
6409
  /*.leafs =*/ NULL,
6091
- /*.visited_hash_set =*/ { 0, NULL, NULL },
6410
+ /*.use_counts =*/ cgraph0->use_counts,
6411
+ /*.visited_hash_set =*/ cgraph0->visited_hash_set,
6092
6412
  /*.order =*/ cgraph0->order,
6093
6413
  };
6094
6414
 
@@ -6115,7 +6435,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
6115
6435
  for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
6116
6436
  // copy all hashset keys (tensors) that are in use
6117
6437
  if (ggml_bitset_get(src->visited_hash_set.used, i)) {
6118
- ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
6438
+ size_t new_hash_pos = ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
6439
+ dst->use_counts[new_hash_pos] = src->use_counts[i];
6119
6440
  }
6120
6441
  }
6121
6442