@novastera-oss/llamarn 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319) hide show
  1. package/android/src/main/cpp/include/llama.h +8 -3
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  10. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  11. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  13. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  14. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  15. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  18. package/cpp/LlamaCppModel.cpp +56 -22
  19. package/cpp/build-info.cpp +2 -2
  20. package/cpp/llama.cpp/CMakeLists.txt +1 -2
  21. package/cpp/llama.cpp/README.md +4 -5
  22. package/cpp/llama.cpp/build-xcframework.sh +1 -1
  23. package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
  24. package/cpp/llama.cpp/common/arg.cpp +24 -0
  25. package/cpp/llama.cpp/common/chat.cpp +37 -20
  26. package/cpp/llama.cpp/common/chat.h +2 -0
  27. package/cpp/llama.cpp/common/common.cpp +3 -0
  28. package/cpp/llama.cpp/common/common.h +5 -0
  29. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  30. package/cpp/llama.cpp/convert_hf_to_gguf.py +860 -23
  31. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
  32. package/cpp/llama.cpp/ggml/CMakeLists.txt +8 -2
  33. package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
  34. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  35. package/cpp/llama.cpp/ggml/include/ggml.h +206 -10
  36. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
  38. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
  39. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
  40. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +37 -3
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -103
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1405 -240
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +8 -0
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +212 -34
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +35 -11
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +187 -54
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +71 -29
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +12 -6
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +269 -110
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  88. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  89. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
  90. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
  91. package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
  92. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  93. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
  94. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  95. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  96. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
  99. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
  100. package/cpp/llama.cpp/ggml/src/ggml-impl.h +125 -183
  101. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
  102. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +51 -9
  103. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +394 -80
  104. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +616 -239
  105. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  106. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +741 -571
  108. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
  110. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  111. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  112. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
  114. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
  115. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
  116. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
  117. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  118. package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
  119. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +697 -1098
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
  131. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +104 -62
  132. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  133. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  134. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  135. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
  136. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  137. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +39 -38
  138. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
  139. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  140. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  141. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  142. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  143. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  144. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  145. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  146. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +767 -292
  147. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  148. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
  149. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
  150. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
  151. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
  152. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
  153. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
  154. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  156. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  158. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  159. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  160. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
  161. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
  162. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  163. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  164. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  166. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  167. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  168. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  169. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
  170. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  172. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
  173. package/cpp/llama.cpp/ggml/src/ggml.c +449 -72
  174. package/cpp/llama.cpp/ggml/src/gguf.cpp +13 -2
  175. package/cpp/llama.cpp/gguf-py/gguf/constants.py +285 -0
  176. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +27 -0
  177. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +137 -21
  178. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +109 -7
  179. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  180. package/cpp/llama.cpp/include/llama.h +8 -43
  181. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  182. package/cpp/llama.cpp/src/llama-arch.cpp +265 -3
  183. package/cpp/llama.cpp/src/llama-arch.h +36 -1
  184. package/cpp/llama.cpp/src/llama-batch.cpp +596 -359
  185. package/cpp/llama.cpp/src/llama-batch.h +105 -70
  186. package/cpp/llama.cpp/src/llama-chat.cpp +26 -6
  187. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  188. package/cpp/llama.cpp/src/llama-context.cpp +101 -107
  189. package/cpp/llama.cpp/src/llama-context.h +13 -13
  190. package/cpp/llama.cpp/src/llama-graph.cpp +286 -404
  191. package/cpp/llama.cpp/src/llama-graph.h +78 -79
  192. package/cpp/llama.cpp/src/llama-hparams.cpp +11 -1
  193. package/cpp/llama.cpp/src/llama-hparams.h +11 -0
  194. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +74 -66
  195. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
  196. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +312 -157
  197. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +79 -46
  198. package/cpp/llama.cpp/src/llama-kv-cells.h +97 -21
  199. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +73 -69
  200. package/cpp/llama.cpp/src/llama-memory-hybrid.h +19 -22
  201. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +88 -77
  202. package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
  203. package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
  204. package/cpp/llama.cpp/src/llama-memory.h +21 -22
  205. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  206. package/cpp/llama.cpp/src/llama-model.cpp +5301 -2922
  207. package/cpp/llama.cpp/src/llama-model.h +40 -0
  208. package/cpp/llama.cpp/src/llama-quant.cpp +88 -5
  209. package/cpp/llama.cpp/src/llama-vocab.cpp +37 -3
  210. package/cpp/llama.cpp/src/llama-vocab.h +42 -0
  211. package/cpp/rn-utils.h +3 -0
  212. package/ios/include/chat.h +2 -0
  213. package/ios/include/common.h +5 -0
  214. package/ios/include/llama.h +8 -43
  215. package/ios/libs/llama.xcframework/Info.plist +19 -19
  216. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  217. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
  218. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  219. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  220. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +206 -10
  221. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -43
  222. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  223. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  224. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
  225. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
  226. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  227. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  228. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
  229. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
  230. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  231. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  232. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
  233. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3744
  234. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
  235. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  236. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +206 -10
  237. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -43
  238. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
  239. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  240. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +206 -10
  241. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -43
  242. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  243. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
  244. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  245. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +206 -10
  246. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -43
  247. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  248. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  249. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  250. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
  251. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  252. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  253. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +206 -10
  254. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -43
  255. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  256. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  257. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
  258. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
  259. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  260. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  261. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
  262. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
  263. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  264. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  265. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5095 -4900
  266. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  267. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  268. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +206 -10
  269. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -43
  270. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  271. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  272. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4871
  273. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3773
  274. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  275. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  276. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
  277. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
  278. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  279. package/package.json +1 -1
  280. package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
  281. package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  282. package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  283. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  284. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  285. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  286. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  287. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  288. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  289. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  290. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  291. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  292. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  293. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  294. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  295. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  296. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  297. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  298. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  299. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  300. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  301. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  302. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  303. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  304. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  305. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  306. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  307. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  308. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  309. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  310. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  311. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  312. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  313. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  314. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  315. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  316. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  317. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  318. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  319. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
@@ -1,52 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 1024) in;
6
-
7
- layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
8
- layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
9
- layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
10
-
11
- layout(push_constant) uniform PushConstants {
12
- uint inAOff;
13
- uint inBOff;
14
- uint outOff;
15
- int ne00;
16
- int nb00;
17
- int nb01;
18
- int nb02;
19
- int nb03;
20
- int ne10;
21
- int ne11;
22
- int ne12;
23
- int ne13;
24
- int nb10;
25
- int nb11;
26
- int nb12;
27
- int nb13;
28
- int ne0;
29
- int nb0;
30
- int nb1;
31
- int nb2;
32
- int nb3;
33
- } pcs;
34
-
35
- void main() {
36
- const uint i03 = gl_WorkGroupID.z;
37
- const uint i02 = gl_WorkGroupID.y;
38
- const uint i01 = gl_WorkGroupID.x;
39
-
40
- const uint i13 = i03 % pcs.ne13;
41
- const uint i12 = i02 % pcs.ne12;
42
- const uint i11 = i01 % pcs.ne11;
43
-
44
- uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01) / 4);
45
- uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11) / 4);
46
- uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1) / 4);
47
-
48
- for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
49
- const uint i10 = i0 % pcs.ne10;
50
- out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] * inB[pcs.inBOff + src1_off + i10];
51
- }
52
- }
@@ -1,69 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #extension GL_KHR_shader_subgroup_arithmetic : require
6
-
7
- layout(local_size_x_id = 0) in;
8
-
9
- layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
10
- layout (binding = 1) readonly buffer tensorInB { float inB[]; };
11
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
12
-
13
- layout (push_constant) uniform parameter {
14
- uint inAOff;
15
- uint inBOff;
16
- uint outOff;
17
- int ne00;
18
- int ne01;
19
- int ne02;
20
- uint nb00;
21
- uint nb01;
22
- uint nb02;
23
- uint nb03;
24
- int ne10;
25
- int ne11;
26
- int ne12;
27
- uint nb10;
28
- uint nb11;
29
- uint nb12;
30
- uint nb13;
31
- int ne0;
32
- int ne1;
33
- uint r2;
34
- uint r3;
35
- } pcs;
36
-
37
- #define N_F16_F32 4
38
-
39
- void main() {
40
- const uint r0 = gl_WorkGroupID.x;
41
- const uint rb = gl_WorkGroupID.y*N_F16_F32;
42
- const uint im = gl_WorkGroupID.z;
43
-
44
- const uint i12 = im%pcs.ne12;
45
- const uint i13 = im/pcs.ne12;
46
-
47
- const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb03;
48
-
49
- const uint x = offset0 / 2 + pcs.inAOff; // Based from inA
50
-
51
- for (uint row = 0; row < N_F16_F32; ++row) {
52
- uint r1 = rb + row;
53
- if (r1 >= pcs.ne11) {
54
- break;
55
- }
56
-
57
- const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
58
-
59
- float sumf = 0;
60
- for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
61
- sumf += float(inA[x+i]) * float(inB[y+i]);
62
- }
63
-
64
- const float all_sum = subgroupAdd(sumf);
65
- if (subgroupElect()) {
66
- out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
67
- }
68
- }
69
- }
@@ -1,51 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #extension GL_KHR_shader_subgroup_arithmetic : require
6
- #extension GL_EXT_debug_printf : enable
7
-
8
- // device subgroup size
9
- layout (local_size_x_id = 0) in;
10
-
11
- layout(binding = 0) readonly buffer tensorInA { float inA[]; };
12
- layout(binding = 1) readonly buffer tensorInB { float inB[]; };
13
- layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
14
-
15
- layout(push_constant) uniform parameter {
16
- uint inAOff;
17
- uint inBOff;
18
- uint outOff;
19
- int ne00;
20
- int ne01;
21
- int ne02;
22
- int ne11;
23
- int ne12;
24
- uint nb01;
25
- uint nb02;
26
- uint nb11;
27
- uint nb12;
28
- uint nb1;
29
- uint nb2;
30
- }
31
- pcs;
32
-
33
-
34
- void main() {
35
- uvec3 gid = gl_WorkGroupID;
36
-
37
- uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
38
- uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
39
-
40
- const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 4 + pcs.inAOff; // Based from inA
41
- const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
42
- float sum = 0.0f;
43
- for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
44
- sum += float(inA[x+i]) * float(inB[y+i]);
45
- }
46
-
47
- const float all_sum = subgroupAdd(sum);
48
- if (subgroupElect()) {
49
- out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum;
50
- }
51
- }
@@ -1,33 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define BLOCKS_IN_QUANT QK4_0
6
- #define SIZE_OF_BLOCK sizeof_block_q4_0
7
- #define N_ROWS 4
8
-
9
- #include "op_mul_mv_q_n_pre.comp"
10
-
11
- // The q4_0 version of this function
12
- float block_q_n_dot_y(uint block_index, uint yb, uint il) {
13
- vec2 acc = vec2(0.0, 0.0);
14
- const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
15
- float d = float(u8BufToFloat16(inA, index));
16
- float sumy = 0.0f;
17
- for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
18
- const uint16_t b = u8BufToU16(inA, index + 2 + il + i);
19
-
20
- const float yl0 = inB[yb + i];
21
- const float yl1 = inB[yb + i + 1];
22
- const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
23
- const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
24
-
25
- sumy += yl0 + yl1 + yl8 + yl9;
26
-
27
- acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
28
- acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
29
- }
30
- return d * (sumy * -8.f + acc[0] + acc[1]);
31
- }
32
-
33
- #include "op_mul_mv_q_n.comp"
@@ -1,35 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define BLOCKS_IN_QUANT QK4_1
6
- #define SIZE_OF_BLOCK sizeof_block_q4_1
7
- #define N_ROWS 4
8
-
9
- #include "op_mul_mv_q_n_pre.comp"
10
-
11
- // The q4_1 version of this function
12
- float block_q_n_dot_y(uint block_index, uint yb, uint il) {
13
- vec2 acc = vec2(0.0, 0.0);
14
- const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
15
- float d = float(u8BufToFloat16(inA, index));
16
- float m = float(u8BufToFloat16(inA, index+2));
17
-
18
- float sumy = 0.0f;
19
- for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
20
- const uint16_t b = u8BufToU16(inA, index + 4 + il + i);
21
-
22
- const float yl0 = inB[yb + i];
23
- const float yl1 = inB[yb + i + 1];
24
- const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
25
- const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
26
-
27
- sumy += yl0 + yl1 + yl8 + yl9;
28
-
29
- acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
30
- acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
31
- }
32
- return d * (acc[0] + acc[1]) + sumy * m;
33
- }
34
-
35
- #include "op_mul_mv_q_n.comp"
@@ -1,140 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define N_DST 4 // rows accumulated per workgroup; also the length of sumf[] in main()
6
- #define SIZE_OF_BLOCK sizeof_block_q4_k // divides the byte strides pcs.nb01..nb03 to turn them into strides in blocks
7
- // Fixed workgroup size: 4 x 8 x 1 = 32 invocations
8
- layout(local_size_x = 4) in;
9
- layout(local_size_y = 8) in;
10
- layout(local_size_z = 1) in;
11
- // inA: quantized input viewed as an array of block_q4_k, inB: float input, out_: float results
12
- layout (binding = 0) readonly buffer tensorInA { block_q4_k inA[]; };
13
- layout (binding = 1) readonly buffer tensorInB { float inB[]; };
14
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
15
-
16
- layout (push_constant) uniform parameter { // NOTE(review): field order presumably has to match the host-side push-constant struct - confirm
17
- uint inAOff; // added to the block index used for inA
18
- uint inBOff; // added to the float index used for inB
19
- uint outOff; // added to the float index used for out_
20
- int ne00; // divided by QK_K to get the number of blocks per row
21
- int ne10; // not referenced by this shader
22
- int ne0; // output index uses r1*ne0 + im*ne0*ne1
23
- int ne1; // see ne0
24
- int ne01; // not referenced by this shader
25
- int ne02; // not referenced by this shader
26
- int ne12; // splits gl_WorkGroupID.z into i12 = z % ne12 and i13 = z / ne12
27
- uint nb01; // byte stride multiplied by the row index when addressing inA
28
- uint nb02; // byte stride multiplied by i12/r2 when addressing inA
29
- uint nb03; // byte stride multiplied by i13/r3 when addressing inA
30
- uint nb11; // byte stride multiplied by gl_WorkGroupID.y when addressing inB
31
- uint nb12; // byte stride multiplied by i12 when addressing inB
32
- uint nb13; // byte stride multiplied by i13 when addressing inB
33
- uint r2; // divisor applied to i12 before it indexes inA
34
- uint r3; // divisor applied to i13 before it indexes inA
35
- } pcs;
36
-
37
- void main() { // q4_k matrix-vector product: each workgroup accumulates N_DST rows of inA against one vector of inB
38
- const uint16_t kmask1 = uint16_t(0x3f3f); // low 6 bits of each byte
39
- const uint16_t kmask2 = uint16_t(0x0f0f); // low 4 bits of each byte
40
- const uint16_t kmask3 = uint16_t(0xc0c0); // top 2 bits of each byte
41
- // NOTE(review): the ranges below and the block stride of 4 in the main loop assume 32 invocations per subgroup - confirm for wider subgroups
42
- const uint ix = gl_SubgroupInvocationID/8; // 0...3
43
- const uint it = gl_SubgroupInvocationID%8; // 0...7
44
- const uint iq = it/4; // 0 or 1
45
- const uint ir = it%4; // 0...3
46
-
47
- const uint nb = pcs.ne00/QK_K; // number of QK_K-sized blocks per row
48
-
49
- const uint r0 = gl_WorkGroupID.x;
50
- const uint r1 = gl_WorkGroupID.y;
51
- const uint im = gl_WorkGroupID.z;
52
-
53
- const uint first_row = r0 * N_DST; // first of the N_DST rows handled by this workgroup
54
- const uint ib_row = first_row * nb; // not referenced again in this shader
55
-
56
- const uint i12 = im%pcs.ne12;
57
- const uint i13 = im/pcs.ne12;
58
- // offset0 is measured in blocks (byte strides / SIZE_OF_BLOCK); offset1 is measured in bytes and divided by 4 below to index floats
59
- const uint offset0 = first_row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
60
- const uint offset1 = r1*pcs.nb11 + (i12 )*pcs.nb12 + (i13 )*pcs.nb13;
61
-
62
- const uint xblk = offset0 + pcs.inAOff;
63
- const uint y = (offset1 / 4) + pcs.inBOff;
64
-
65
- float yl[16];
66
- float yh[16];
67
- float sumf[N_DST] = {0.f, 0.f, 0.f, 0.f}; // one partial sum per row; the initializer has exactly 4 entries
68
- float all_sum = 0.f;
69
- // First inB element read by this invocation: block ix, then a 64*iq + 8*ir offset inside that block
70
- uint y4 = y + ix * QK_K + 64 * iq + 8 * ir;
71
- // Each invocation visits every 4th block, starting at block ix
72
- for (uint ib = ix; ib < nb; ib += 4) {
73
- const uint blk_idx = ib + xblk;
74
- // Load four runs of 8 floats (at +0, +32, +128, +160) and keep a running sum of each run
75
- float sumy[4] = {0.f, 0.f, 0.f, 0.f};
76
- for (int i = 0; i < 8; ++i) {
77
- yl[i+0] = inB[y4+i+ 0]; sumy[0] += yl[i+0];
78
- yl[i+8] = inB[y4+i+ 32]; sumy[1] += yl[i+8];
79
- yh[i+0] = inB[y4+i+128]; sumy[2] += yh[i+0];
80
- yh[i+8] = inB[y4+i+160]; sumy[3] += yh[i+8];
81
- }
82
-
83
- for (int row = 0; row < N_DST; row++) {
84
- uint row_idx = row * (pcs.nb01 / SIZE_OF_BLOCK); // block offset of this row relative to first_row
85
- // Five 16-bit words read from the scales field; sc_1 and sc_3 are read but not used below
86
- uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0);
87
- uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2);
88
- uint16_t sc_2 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 4);
89
- uint16_t sc_3 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 6);
90
- uint16_t sc_4 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 8);
91
- // sc16[0..1]: low 6 bits of each byte of sc_0/sc_2; sc16[2..3]: 4 bits from sc_4 merged with the top 2 bits of sc_0/sc_2
92
- uint16_t sc16[4];
93
- sc16[0] = sc_0 & kmask1;
94
- sc16[1] = sc_2 & kmask1;
95
- sc16[2] = ((sc_4 >> 0) & kmask2) | ((sc_0 & kmask3) >> 2);
96
- sc16[3] = ((sc_4 >> 4) & kmask2) | ((sc_2 & kmask3) >> 2);
97
- // Nibbles are masked in place (not shifted); the 1/256 and 1/16 factors in the final expression undo the positional weight
98
- float acc1[4] = {0.f, 0.f, 0.f, 0.f};
99
- float acc2[4] = {0.f, 0.f, 0.f, 0.f};
100
- for (int i = 0; i < 8; i += 2) {
101
- uint16_t q1 = u8BufToU16(inA[blk_idx + row_idx].qs, 32 * iq + 8 * ir + i);
102
- uint16_t q2 = u8BufToU16(inA[blk_idx + row_idx].qs, 64 + 32 * iq + 8 * ir + i);
103
- acc1[0] += yl[i+0] * (q1 & 0x000F);
104
- acc1[1] += yl[i+1] * (q1 & 0x0F00);
105
- acc1[2] += yl[i+8] * (q1 & 0x00F0);
106
- acc1[3] += yl[i+9] * (q1 & 0xF000);
107
- acc2[0] += yh[i+0] * (q2 & 0x000F);
108
- acc2[1] += yh[i+1] * (q2 & 0x0F00);
109
- acc2[2] += yh[i+8] * (q2 & 0x00F0);
110
- acc2[3] += yh[i+9] * (q2 & 0xF000);
111
- }
112
- // Split each 16-bit entry of sc16 into its low and high byte
113
- uint8_t sc8_0 = uint8_t(sc16[0] & 0xFF);
114
- uint8_t sc8_1 = uint8_t(sc16[0] >> 8 );
115
- uint8_t sc8_2 = uint8_t(sc16[1] & 0xFF);
116
- uint8_t sc8_3 = uint8_t(sc16[1] >> 8 );
117
- uint8_t sc8_4 = uint8_t(sc16[2] & 0xFF);
118
- uint8_t sc8_5 = uint8_t(sc16[2] >> 8 );
119
- uint8_t sc8_6 = uint8_t(sc16[3] & 0xFF);
120
- uint8_t sc8_7 = uint8_t(sc16[3] >> 8 );
121
- // dall weights the quant/scale products; dmin weights the sumy terms that are subtracted
122
- float dall = float(inA[blk_idx + row_idx].d);
123
- float dmin = float(inA[blk_idx + row_idx].dmin);
124
- sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8_0 +
125
- (acc1[2] + 1.f/256.f * acc1[3]) * sc8_1 * 1.f/16.f +
126
- (acc2[0] + 1.f/256.f * acc2[1]) * sc8_4 +
127
- (acc2[2] + 1.f/256.f * acc2[3]) * sc8_5 * 1.f/16.f) -
128
- dmin * (sumy[0] * sc8_2 + sumy[1] * sc8_3 + sumy[2] * sc8_6 + sumy[3] * sc8_7);
129
- }
130
-
131
- y4 += 4 * QK_K; // advance by the 4 blocks covered in this iteration
132
- }
133
- // NOTE(review): rows are written without a first_row + row < pcs.ne01 guard - confirm the dispatch never overshoots the row count
134
- for (int row = 0; row < N_DST; ++row) {
135
- all_sum = subgroupAdd(sumf[row]); // sum the per-invocation partials across the subgroup
136
- if (subgroupElect()) { // a single elected invocation stores the row result
137
- out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = all_sum;
138
- }
139
- }
140
- }
@@ -1,106 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define SIZE_OF_BLOCK sizeof_block_q6_k // converts the byte strides pcs.nb01..nb03 to blocks and block indices back to byte offsets
6
- // Workgroup x and y sizes come from specialization constants 0 and 1; z is fixed at 1
7
- layout(local_size_x_id = 0) in;
8
- layout(local_size_y_id = 1) in;
9
- layout(local_size_z = 1) in;
10
- // inA: quantized input addressed byte by byte, inB: float input, out_: float results
11
- layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
12
- layout (binding = 1) readonly buffer tensorInB { float inB[]; };
13
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
14
-
15
- layout (push_constant) uniform parameter { // NOTE(review): field order presumably has to match the host-side push-constant struct - confirm
16
- uint inAOff; // added to the byte offset used for inA
17
- uint inBOff; // added to the float index used for inB
18
- uint outOff; // added to the float index used for out_
19
- int ne00; // divided by QK_K to get the number of blocks per row
20
- int ne10; // not referenced by this shader
21
- int ne0; // output index uses r1*ne0 + im*ne0*ne1
22
- int ne1; // see ne0
23
- int ne01; // not referenced by this shader
24
- int ne02; // not referenced by this shader
25
- int ne12; // splits gl_WorkGroupID.z into i12 = z % ne12 and i13 = z / ne12
26
- uint nb01; // byte stride multiplied by the row index when addressing inA
27
- uint nb02; // byte stride multiplied by i12/r2 when addressing inA
28
- uint nb03; // byte stride multiplied by i13/r3 when addressing inA
29
- uint nb11; // byte stride multiplied by gl_WorkGroupID.y when addressing inB
30
- uint nb12; // byte stride multiplied by i12 when addressing inB
31
- uint nb13; // byte stride multiplied by i13 when addressing inB
32
- uint r2; // divisor applied to i12 before it indexes inA
33
- uint r3; // divisor applied to i13 before it indexes inA
34
- } pcs;
35
-
36
- void main() { // q6_k matrix-vector product: each subgroup produces one output row
37
- const uint8_t kmask1 = uint8_t(0x03); // bits 0-1
38
- const uint8_t kmask2 = uint8_t(0x0C); // bits 2-3
39
- const uint8_t kmask3 = uint8_t(0x30); // bits 4-5
40
- const uint8_t kmask4 = uint8_t(0xC0); // bits 6-7
41
-
42
- const uint nb = pcs.ne00/QK_K; // number of QK_K-sized blocks per row
43
-
44
- const uint r0 = gl_WorkGroupID.x;
45
- const uint r1 = gl_WorkGroupID.y;
46
- const uint im = gl_WorkGroupID.z;
47
-
48
- const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID); // row index derived from workgroup and subgroup ids
49
-
50
- const uint i12 = im%pcs.ne12;
51
- const uint i13 = im/pcs.ne12;
52
- // x is a block index (byte strides / SIZE_OF_BLOCK); yy is a float index (byte strides / 4)
53
- const uint x = row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
54
- const uint yy = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
55
-
56
- float sumf = 0;
57
-
58
- // bits of invocation ID for gl_SubgroupSize=32:
59
- // x x x x x
60
- // 4 3 2 1 0
61
- // ( tid ) ix
62
- // ip ( il )
63
-
64
- const uint block_stride = gl_SubgroupSize / 16; // number of blocks each subgroup processes
65
- const uint tid = gl_SubgroupInvocationID/block_stride; // first block_stride groups have tid=0
66
- const uint ix = gl_SubgroupInvocationID%block_stride; // first block is 0..block_stride-1
67
- const uint ip = tid/8; // first or second half of block (0 or 1)
68
- const uint il = tid%8; // each half has 8 parts, one per scale
69
- const uint n = 4; // 4 scales at a time (and 4 sums)
70
- const uint l0 = n*il; // offset into half-block, 0..28
71
- const uint is = 8*ip + l0/16; // 0, 1, 8, 9
72
- // Per-invocation starting offsets into inB and into the two quant regions of a block
73
- const uint y_offset = 128*ip + l0;
74
- const uint q_offset_l = 64*ip + l0;
75
- const uint q_offset_h = 32*ip + l0;
76
-
77
- for (uint i = ix; i < nb; i += block_stride) {
78
-
79
- const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff; // byte offset of block i of this row in inA
80
-
81
- const uint qlIndex = q_offset_l;
82
- const uint q2Index = qlIndex + QK_K/8;
83
- const uint qhIndex = q_offset_h;
84
- const uint y = yy + i * QK_K + y_offset;
85
- // Each weight is 4 bits from a byte at qlIndex/q2Index combined with 2 bits from the byte at QK_K/2 + qhIndex, minus 32
86
- float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
87
- for (uint l = 0; l < n; ++l) {
88
- const uint8_t currentQ1 = inA[baseIndex + qlIndex + l];
89
- const uint8_t currentQ2 = inA[baseIndex + q2Index + l];
90
- const uint8_t currentQh = inA[baseIndex + QK_K/2 + qhIndex + l];
91
- // The shifts move each selected bit pair of currentQh to bits 4-5, above the 4-bit nibble
92
- sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32);
93
- sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32);
94
- sums[2] += inB[y+l+64] * (int8_t((currentQ1 >> 4) | ((currentQh & kmask3) << 0)) - 32);
95
- sums[3] += inB[y+l+96] * (int8_t((currentQ2 >> 4) | ((currentQh & kmask4) >> 2)) - 32);
96
- }
97
- // Signed 8-bit scales are read from QK_K/2 + QK_K/4 onward; d is read QK_K/16 bytes after that
98
- float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16);
99
- sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is]));
100
- }
101
- // NOTE(review): the row is written without a row < pcs.ne01 guard - confirm the dispatch never overshoots the row count
102
- const float tot = subgroupAdd(sumf); // sum the per-invocation partials across the subgroup
103
- if (subgroupElect()) { // a single elected invocation stores the row result
104
- out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
105
- }
106
- }
@@ -1,73 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #include "op_mul_mv_q_n_pre.comp"
6
-
7
- #define SIZE_OF_D 2 // bytes skipped at the start of each block before the int8 quants are read
8
-
9
- #define N_DST 4 // each SIMD group works on 4 rows
10
- #define N_SIMDGROUP 2 // number of SIMD groups in a thread group
11
- #define N_SIMDWIDTH 32 // assuming SIMD group size is 32
12
-
13
- #define NB_Q8_0 8 // quants handled by one invocation per block; also the length of yl[] in main()
14
-
15
- void main() { // q8_0 matrix-vector product: each subgroup accumulates N_DST rows of inA against one vector of inB
16
- // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
17
- if (gl_SubgroupInvocationID > 31)
18
- return;
19
-
20
- const int nr = N_DST;
21
- const int nsg = N_SIMDGROUP;
22
- const int nw = N_SIMDWIDTH;
23
-
24
- const int nb = pcs.ne00/QK8_0; // number of QK8_0-sized blocks per row
25
- const uint r0 = gl_WorkGroupID.x;
26
- const uint r1 = gl_WorkGroupID.y;
27
- const uint im = gl_WorkGroupID.z;
28
-
29
- const uint first_row = (r0 * nsg + gl_SubgroupID) * nr; // first of the nr rows handled by this subgroup
30
-
31
- const uint i12 = im%pcs.ne12;
32
- const uint i13 = im/pcs.ne12;
33
- // offset0 is measured in blocks and is built from element counts (nb, ne01, ne02) rather than the nb0x byte strides
34
- const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
35
-
36
- const uint x = offset0*sizeof_block_q8_0 + pcs.inAOff; // Based from inA
37
- const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff; // based from inB
38
-
39
- float yl[NB_Q8_0];
40
- float sumf[N_DST]={0.f, 0.f, 0.f, 0.f}; // one partial sum per row; the initializer has exactly 4 entries
41
-
42
- const uint ix = gl_SubgroupInvocationID.x/4; // block handled by this invocation within the current group of nw/4 blocks
43
- const uint il = gl_SubgroupInvocationID.x%4; // which run of NB_Q8_0 quants inside that block
44
-
45
- uint yb = y + ix * QK8_0 + NB_Q8_0*il;
46
-
47
- // each thread in a SIMD group deals with NB_Q8_0 quants at a time
48
- for (uint ib = ix; ib < nb; ib += nw/4) {
49
- for (int i = 0; i < NB_Q8_0; ++i) {
50
- yl[i] = inB[yb + i];
51
- }
52
-
53
- for (int row = 0; row < nr; row++) {
54
- const uint block_offset = (ib+row*nb) * sizeof_block_q8_0; // byte offset of block ib of this row, relative to x
55
- float sumq = 0.f;
56
- for (int iq = 0; iq < NB_Q8_0; ++iq) {
57
- const int8_t qs_iq = int8_t(inA[x + block_offset + SIZE_OF_D + NB_Q8_0*il + iq]);
58
- sumq += qs_iq * yl[iq];
59
- }
60
- const float16_t d = u8BufToFloat16(inA, x + block_offset); // per-block scale read from the start of the block
61
- sumf[row] += sumq*d;
62
- }
63
-
64
- yb += NB_Q8_0 * nw;
65
- }
66
- // Reduce across the subgroup; one elected invocation stores each in-range row at the caller-supplied output offset
67
- for (int row = 0; row < nr; ++row) {
68
- const float tot = subgroupAdd(sumf[row]);
69
- if (subgroupElect() && first_row + row < pcs.ne01) {
70
- out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
71
- }
72
- }
73
- }
@@ -1,52 +0,0 @@
1
- void main() { // Generic quantized matrix-vector product: each subgroup accumulates N_ROWS rows using block_q_n_dot_y()
2
- // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
3
- if (gl_SubgroupInvocationID > 31)
4
- return;
5
-
6
- const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT); // number of quantized blocks per row
7
-
8
- const uint r0 = gl_WorkGroupID.x;
9
- const uint r1 = gl_WorkGroupID.y;
10
- const uint im = gl_WorkGroupID.z;
11
-
12
- const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS; // first of the N_ROWS rows handled by this subgroup
13
-
14
- const uint i12 = im%pcs.ne12;
15
- const uint i13 = im/pcs.ne12;
16
-
17
- // pointers to src0 rows
18
- uint ax[N_ROWS];
19
- for (int row = 0; row < N_ROWS; ++row) {
20
- const uint offset0 = (first_row + row)*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
21
-
22
- ax[row] = offset0 + pcs.inAOff; // block index of the row's first block (byte strides were divided by SIZE_OF_BLOCK)
23
- }
24
- // Byte strides divided by 4 give a float index into inB
25
- const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
26
-
27
- float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f}; // the initializer has exactly 4 entries, so N_ROWS is expected to be 4
28
- // Two invocations share a block: ix picks the block (0..15 after the early return above), il is 0 or BLOCKS_IN_QUANT/4
29
- const uint ix = gl_SubgroupInvocationID/2;
30
- const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2);
31
-
32
- uint yb = y + ix * BLOCKS_IN_QUANT + il;
33
-
34
- //debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",
35
- // gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
36
- // gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);
37
- // Each invocation visits every 16th block, starting at block ix
38
- for (uint ib = ix; ib < nb; ib += 16) {
39
- for (int row = 0; row < N_ROWS; row++) {
40
- sumf[row] += block_q_n_dot_y(ax[row] + ib, yb, il);
41
- }
42
-
43
- yb += BLOCKS_IN_QUANT * 16; // advance by the 16 blocks covered in this iteration
44
- }
45
- // Reduce across the subgroup; one elected invocation stores each row that lies below pcs.ne01
46
- for (int row = 0; row < N_ROWS; ++row) {
47
- const float tot = subgroupAdd(sumf[row]);
48
- if (first_row + row < pcs.ne01 && subgroupElect()) {
49
- out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
50
- }
51
- }
52
- }
@@ -1,28 +0,0 @@
1
- layout(local_size_x_id = 0) in; // workgroup x size comes from specialization constant 0
2
- layout(local_size_y = 8) in;
3
- layout(local_size_z = 1) in;
4
- // inA: quantized input addressed byte by byte, inB: float input, out_: float results
5
- layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
6
- layout (binding = 1) readonly buffer tensorInB { float inB[]; };
7
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
8
-
9
- layout (push_constant) uniform parameter { // NOTE(review): field order presumably has to match the host-side push-constant struct - confirm
10
- uint inAOff; // offset applied when addressing inA
11
- uint inBOff; // offset applied when addressing inB
12
- uint outOff; // offset applied when addressing out_
13
- int ne00; // presumably the row length in elements (shaders divide it by the quant block size) - TODO confirm
14
- int ne01; // presumably the row count of the quantized input (used as the upper bound for row writes) - TODO confirm
15
- int ne02;
16
- int ne10;
17
- int ne12; // presumably splits gl_WorkGroupID.z into two batch indices - TODO confirm against the including shader
18
- int ne0; // presumably the output row length - TODO confirm
19
- int ne1;
20
- uint nb01; // nb0x: presumably byte strides of the quantized input - TODO confirm
21
- uint nb02;
22
- uint nb03;
23
- uint nb11; // nb1x: presumably byte strides of the float input - TODO confirm
24
- uint nb12;
25
- uint nb13;
26
- uint r2; // r2/r3: presumably divisors applied to the batch indices when addressing inA - TODO confirm
27
- uint r3;
28
- } pcs;