@novastera-oss/llamarn 0.3.0 → 0.3.1

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (190)
  1. package/android/build.gradle +2 -1
  2. package/android/proguard-rules.pro +12 -0
  3. package/android/src/main/cpp/include/llama.h +15 -47
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  13. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  15. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  20. package/cpp/build-info.cpp +2 -2
  21. package/cpp/llama.cpp/CMakePresets.json +11 -0
  22. package/cpp/llama.cpp/CODEOWNERS +1 -0
  23. package/cpp/llama.cpp/README.md +4 -3
  24. package/cpp/llama.cpp/common/arg.cpp +45 -1
  25. package/cpp/llama.cpp/common/common.cpp +22 -6
  26. package/cpp/llama.cpp/common/common.h +18 -4
  27. package/cpp/llama.cpp/convert_hf_to_gguf.py +500 -32
  28. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +12 -13
  29. package/cpp/llama.cpp/ggml/CMakeLists.txt +6 -1
  30. package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +85 -47
  31. package/cpp/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  32. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +1 -0
  33. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +0 -15
  34. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +7 -0
  35. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +8 -20
  36. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +58 -3
  38. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +130 -22
  39. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +122 -16
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +5 -2
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +1 -1
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +109 -12
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +3 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +88 -10
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +3 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +0 -1
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +3 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +3 -3
  50. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +14 -4
  51. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +64 -17
  52. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +225 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +41 -301
  54. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +85 -67
  55. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +45 -62
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +28 -43
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +41 -56
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +36 -47
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +31 -43
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +22 -37
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +3 -13
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +73 -23
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +1 -1
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +111 -3
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +6 -4
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1152 -689
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +92 -5
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +2 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +275 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +7 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +13 -1
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -2
  75. package/cpp/llama.cpp/ggml/src/ggml-impl.h +16 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +13 -3
  77. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +407 -69
  78. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +380 -83
  79. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +18 -4
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +2 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +295 -2
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  85. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  86. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +4 -4
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +14 -26
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +131 -46
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +8 -9
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +43 -43
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
  94. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +287 -22
  95. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +265 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +1 -5
  97. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  98. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  99. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  100. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  101. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  102. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +2 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +3 -8
  105. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +8 -2
  106. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  107. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  108. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +71 -16
  109. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  110. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +907 -0
  111. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  112. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +35 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +56 -0
  115. package/cpp/llama.cpp/ggml/src/ggml.c +4 -6
  116. package/cpp/llama.cpp/gguf-py/gguf/constants.py +98 -0
  117. package/cpp/llama.cpp/gguf-py/gguf/metadata.py +4 -0
  118. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_dump.py +24 -1
  119. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +75 -52
  120. package/cpp/llama.cpp/include/llama.h +15 -7
  121. package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +34 -0
  122. package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +43 -0
  123. package/cpp/llama.cpp/requirements/requirements-all.txt +1 -0
  124. package/cpp/llama.cpp/requirements/requirements-server-bench.txt +5 -0
  125. package/cpp/llama.cpp/src/llama-arch.cpp +106 -0
  126. package/cpp/llama.cpp/src/llama-arch.h +5 -0
  127. package/cpp/llama.cpp/src/llama-batch.cpp +76 -70
  128. package/cpp/llama.cpp/src/llama-batch.h +24 -18
  129. package/cpp/llama.cpp/src/llama-chat.cpp +43 -1
  130. package/cpp/llama.cpp/src/llama-chat.h +2 -0
  131. package/cpp/llama.cpp/src/llama-context.cpp +180 -106
  132. package/cpp/llama.cpp/src/llama-context.h +26 -16
  133. package/cpp/llama.cpp/src/llama-cparams.h +3 -2
  134. package/cpp/llama.cpp/src/llama-graph.cpp +203 -39
  135. package/cpp/llama.cpp/src/llama-graph.h +147 -72
  136. package/cpp/llama.cpp/src/llama-hparams.cpp +40 -0
  137. package/cpp/llama.cpp/src/llama-hparams.h +10 -2
  138. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +11 -5
  139. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +3 -0
  140. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +698 -302
  141. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +89 -31
  142. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +1 -0
  143. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +16 -1
  144. package/cpp/llama.cpp/src/llama-model.cpp +1293 -312
  145. package/cpp/llama.cpp/src/llama-model.h +3 -4
  146. package/cpp/llama.cpp/src/llama-quant.cpp +1 -2
  147. package/cpp/llama.cpp/src/llama-vocab.cpp +363 -8
  148. package/cpp/llama.cpp/src/llama-vocab.h +2 -0
  149. package/cpp/llama.cpp/src/unicode.cpp +207 -0
  150. package/cpp/llama.cpp/src/unicode.h +2 -0
  151. package/ios/include/common.h +18 -4
  152. package/ios/include/llama.h +15 -7
  153. package/ios/libs/llama.xcframework/Info.plist +15 -15
  154. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  155. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -5059
  156. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +15 -7
  157. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  158. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  159. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
  160. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3889
  161. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
  162. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  163. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  164. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
  165. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4016 -3891
  166. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +15 -7
  167. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +15 -7
  168. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  169. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +15 -7
  170. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  171. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  172. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  173. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -5059
  174. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +15 -7
  175. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  176. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  177. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
  178. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3889
  179. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
  180. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  181. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  182. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5303 -5095
  183. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +15 -7
  184. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  185. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  186. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5274 -5066
  187. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4044 -3919
  188. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
  189. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  190. package/package.json +4 -4
package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp
@@ -4015,6 +4015,9 @@ static void ggml_compute_forward_rms_norm_f32(
 
                 const float scale = 1.0f/sqrtf(mean + eps);
 
+                // if you hit this, likely you got an inf somewhere earlier
+                assert(scale > 0.0f);
+
                 ggml_vec_scale_f32(ne00, y, scale);
             }
         }
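For context, a minimal host-side sketch of the computation this new assert guards, assuming the usual RMS-norm definition (illustrative names, not the package's code):

    #include <cassert>
    #include <cmath>

    // Sketch: RMS-norm scaling with the guard added above. An inf earlier in the
    // graph drives mean to inf and scale to 0; a NaN fails the comparison outright.
    static void rms_norm_row(float * y, int n, float eps) {
        float sum = 0.0f;
        for (int i = 0; i < n; ++i) {
            sum += y[i]*y[i];               // sum of squares of the row
        }
        const float mean  = sum/n;
        const float scale = 1.0f/sqrtf(mean + eps);
        assert(scale > 0.0f);               // catches inf/NaN propagated from upstream
        for (int i = 0; i < n; ++i) {
            y[i] *= scale;                  // normalize in place
        }
    }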
package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp
@@ -14,7 +14,6 @@
 #include <cmath>
 #include <cstring>
 #include <cassert>
-#include <cstdlib> // for qsort
 #include <cstdio>  // for GGML_ASSERT
 
 #include "repack.h"
package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp
@@ -221,6 +221,9 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
     for (int i = np; i < n; ++i) {
         sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
     }
+
+    // if you hit this, you are likely running outside the FP range
+    assert(!isnan(sumf) && !isinf(sumf));
 #else
     for (int i = 0; i < n; ++i) {
         sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt
@@ -102,12 +102,12 @@ if (CUDAToolkit_FOUND)
     if (GGML_STATIC)
         if (WIN32)
            # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
-            target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
+            target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas)
         else ()
-            target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+            target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static)
         endif()
     else()
-        target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
+        target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas)
     endif()
 
     if (GGML_CUDA_NO_VMM)
package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh
@@ -56,7 +56,7 @@
 #define GGML_CUDA_CC_GCN4   (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
 #define GGML_CUDA_CC_VEGA   (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
 #define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
-#define GGML_CUDA_CC_CDNA   (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
+#define GGML_CUDA_CC_CDNA1  (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
 #define GGML_CUDA_CC_CDNA2  (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renameing
 #define GGML_CUDA_CC_CDNA3  (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
 
@@ -72,8 +72,9 @@
 #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
 #define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
 #define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
-#define GGML_CUDA_CC_IS_GCN(cc)   (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
-#define GGML_CUDA_CC_IS_CDNA(cc)  (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_GCN(cc)   (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
+#define GGML_CUDA_CC_IS_CDNA(cc)  (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
 
 // Moore Threads
 #define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
@@ -226,6 +227,10 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
 
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
+#define AMD_MFMA_AVAILABLE
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
+
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
@@ -288,6 +293,11 @@ static bool fp32_mma_hardware_available(const int cc) {
     return GGML_CUDA_CC_IS_CDNA(cc);
 }
 
+// AMD CDNA3 matrix cores.. Will add support for other CDNA generations later.
+static bool amd_mfma_available(const int cc) {
+    return cc >= GGML_CUDA_CC_OFFSET_AMD && GGML_CUDA_CC_IS_CDNA3(cc);
+}
+
 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
 static bool new_mma_available(const int cc) {
     return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
@@ -765,7 +775,7 @@ struct ggml_tensor_extra_gpu {
 };
 
 
-#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS))
+#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)) || defined(GGML_MUSA_GRAPHS)
 #define USE_CUDA_GRAPH
 #endif
 
package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu
@@ -6,24 +6,33 @@
 #define CUDA_Q8_0_NE_ALIGN 2048
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
-    const int64_t i = (int64_t)2*(blockDim.x*blockIdx.x + threadIdx.x);
+static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02,
+        const int64_t s01, const int64_t s02, const int64_t s03) {
+    const int64_t i00 = 2 * (int64_t(blockDim.x)*blockIdx.x + threadIdx.x);
 
-    if (i >= k) {
+    if (i00 >= ne00) {
         return;
     }
 
-    const int64_t ib = i/qk; // block index
-    const int64_t iqs = (i%qk)/qr; // quant index
-    const int64_t iybs = i - i%qk; // y block start index
+    const int64_t i01 = blockIdx.y;
+    const int64_t i02 = blockIdx.z % ne02;
+    const int64_t i03 = blockIdx.z / ne02;
+
+    const int64_t ibx0 = i03*s03 + i02*s02 + i01*s01;
+
+    const int64_t ib = ibx0 + i00/qk; // block index
+    const int64_t iqs = (i00%qk)/qr; // quant index
+    const int64_t iybs = i00 - i00%qk; // y block start index
     const int64_t y_offset = qr == 1 ? 1 : qk/2;
 
     // dequantize
     dfloat2 v;
     dequantize_kernel(vx, ib, iqs, v);
 
-    y[iybs + iqs + 0] = v.x;
-    y[iybs + iqs + y_offset] = v.y;
+    const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
+    y[iy0 + 0] = float(v.x);
+    y[iy0 + y_offset] = float(v.y);
 }
 
 template <bool need_check>
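The reworked kernel above assigns blockIdx.y to the row index i01 and packs i02/i03 into blockIdx.z, addressing the quantized source through per-dimension strides counted in quant blocks while writing a contiguous destination. A minimal host-side sketch of that index math (illustrative names, not the package's code):

    #include <cstdint>

    // Sketch: source block index and destination offset for element (i00, i01, i02, i03).
    // s01/s02/s03 are source strides in quant blocks, qk is the block size, and iqs is
    // the intra-block quant index, mirroring the kernel above.
    struct DequantIdx { int64_t ib; int64_t iy; };

    static DequantIdx dequant_idx(int64_t i00, int64_t i01, int64_t i02, int64_t i03,
                                  int64_t ne00, int64_t ne01, int64_t ne02,
                                  int64_t s01, int64_t s02, int64_t s03,
                                  int qk, int iqs) {
        const int64_t ibx0 = i03*s03 + i02*s02 + i01*s01;        // first quant block of the source row
        const int64_t ib   = ibx0 + i00/qk;                      // block holding element i00
        const int64_t row  = ((i03*ne02 + i02)*ne01 + i01)*ne00; // contiguous destination row base
        return { ib, row + (i00 - i00%qk) + iqs };               // destination write offset
    }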
@@ -457,9 +466,17 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
-    const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
-    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+static void dequantize_block_cuda(const void * vx, dst_t * y,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
+    const dim3 num_blocks((ne00 + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE), ne01, ne02*ne03);
+    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
+        (vx, y, ne00, ne01, ne02, s01, s02, s03);
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void dequantize_block_cont_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
+    dequantize_block_cuda<qk, qr, dequantize_kernel, dst_t>(vx, y, k, 1, 1, 1, k/qk, k/qk, k/qk, stream);
 }
 
 static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) {
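dequantize_block_cont_cuda keeps the old contiguous call sites working: a flat buffer of k elements is one row of k/qk quant blocks, which is why every higher-dimension stride collapses to k/qk. A hedged sketch of how a caller might drive the generalized path for a padded, non-contiguous source (extents and padding here are illustrative, not taken from the package):

    // Sketch: dequantize a padded Q8_0 tensor whose rows carry 2 extra blocks of padding.
    static void dequantize_padded_q8_0(const void * vx, float * y, cudaStream_t stream) {
        const int64_t ne00 = 4096, ne01 = 32, ne02 = 8, ne03 = 1; // illustrative extents
        const int64_t s01  = ne00/QK8_0 + 2;  // row stride in blocks, including padding
        const int64_t s02  = s01*ne01;        // plane stride in blocks
        const int64_t s03  = s02*ne02;        // batch stride in blocks
        dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0, float>(
            vx, y, ne00, ne01, ne02, ne03, s01, s02, s03, stream);
    }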
@@ -624,14 +641,14 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         case GGML_TYPE_Q4_1:
             return dequantize_row_q4_1_cuda;
         case GGML_TYPE_Q5_0:
-            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+            return dequantize_block_cont_cuda<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:
-            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+            return dequantize_block_cont_cuda<QK5_1, QR5_1, dequantize_q5_1>;
         case GGML_TYPE_Q8_0:
             if (fp16_available(ggml_cuda_info().devices[ggml_cuda_get_device()].cc)) {
                 return dequantize_block_q8_0_f16_cuda;
             }
-            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+            return dequantize_block_cont_cuda<QK8_0, QR8_0, dequantize_q8_0>;
         case GGML_TYPE_Q2_K:
             return dequantize_row_q2_K_cuda;
         case GGML_TYPE_Q3_K:
@@ -676,11 +693,11 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
         case GGML_TYPE_Q4_1:
             return dequantize_row_q4_1_cuda;
         case GGML_TYPE_Q5_0:
-            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+            return dequantize_block_cont_cuda<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:
-            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+            return dequantize_block_cont_cuda<QK5_1, QR5_1, dequantize_q5_1>;
         case GGML_TYPE_Q8_0:
-            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+            return dequantize_block_cont_cuda<QK8_0, QR8_0, dequantize_q8_0>;
         case GGML_TYPE_Q2_K:
             return dequantize_row_q2_K_cuda;
         case GGML_TYPE_Q3_K:
@@ -722,6 +739,16 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_F32:
             return convert_unary_cuda<float>;
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
         case GGML_TYPE_BF16:
             return convert_unary_cuda<nv_bfloat16>;
         default:
@@ -733,6 +760,16 @@ to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_F32:
             return convert_unary_cuda<float, nv_bfloat16>;
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
         case GGML_TYPE_F16:
             return convert_unary_cuda<half, nv_bfloat16>;
         default:
@@ -744,6 +781,16 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) {
     switch (type) {
        case GGML_TYPE_F16:
            return convert_unary_cuda<half, float>;
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
        case GGML_TYPE_BF16:
            return convert_unary_cuda<nv_bfloat16, float>;
        default:
package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh (new file)
@@ -0,0 +1,225 @@
+#pragma once
+
+#include "ggml-common.h"
+
+template<typename src_t, typename dst_t>
+static __device__ __forceinline__ void convert_flt(const src_t * src, dst_t * dst) {
+    if constexpr (std::is_same_v<src_t, dst_t>) {
+        *dst = *src;
+    } else {
+        *dst = float(*src);
+    }
+}
+
+static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
+    if (x <= val[0]) return 0;
+    if (x >= val[n-1]) return n-1;
+    int ml = 0, mu = n-1;
+    while (mu-ml > 1) {
+        int mav = (ml+mu)/2;
+        if (x < val[mav]) mu = mav; else ml = mav;
+    }
+    return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
+}
+
+static __device__ void quantize_f32_q4_0_block(const float * __restrict__ x, block_q4_0 * __restrict__ y) {
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK4_0; ++j) {
+        const float v = x[j];
+        if (amax < fabsf(v)) {
+            amax = fabsf(v);
+            vmax = v;
+        }
+    }
+
+    const float d  = vmax / -8;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    y->d = d;
+
+    for (int j = 0; j < QK4_0/2; ++j) {
+        const float x0 = x[0 + j]*id;
+        const float x1 = x[QK4_0/2 + j]*id;
+
+        const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
+        const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
+
+        y->qs[j]  = xi0;
+        y->qs[j] |= xi1 << 4;
+    }
+}
+
+static __device__ void quantize_f32_q4_1_block(const float * __restrict__ x, block_q4_1 * __restrict__ y) {
+    float vmin = FLT_MAX;
+    float vmax = -FLT_MAX;
+
+    for (int j = 0; j < QK4_1; ++j) {
+        const float v = x[j];
+        if (v < vmin) vmin = v;
+        if (v > vmax) vmax = v;
+    }
+
+    const float d  = (vmax - vmin) / ((1 << 4) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    y->dm.x = d;
+    y->dm.y = vmin;
+
+    for (int j = 0; j < QK4_1/2; ++j) {
+        const float x0 = (x[0 + j] - vmin)*id;
+        const float x1 = (x[QK4_1/2 + j] - vmin)*id;
+
+        const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
+        const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
+
+        y->qs[j]  = xi0;
+        y->qs[j] |= xi1 << 4;
+    }
+}
+
+static __device__ void quantize_f32_q5_0_block(const float * __restrict__ x, block_q5_0 * __restrict__ y) {
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK5_0; ++j) {
+        const float v = x[j];
+        if (amax < fabsf(v)) {
+            amax = fabsf(v);
+            vmax = v;
+        }
+    }
+
+    const float d  = vmax / -16;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    y->d = d;
+
+    uint32_t qh = 0;
+    for (int j = 0; j < QK5_0/2; ++j) {
+        const float x0 = x[0 + j]*id;
+        const float x1 = x[QK5_0/2 + j]*id;
+
+        const uint8_t xi0 = min(31, (int8_t)(x0 + 16.5f));
+        const uint8_t xi1 = min(31, (int8_t)(x1 + 16.5f));
+
+        y->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
+        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
+    }
+    memcpy(y->qh, &qh, sizeof(qh));
+}
+
+static __device__ void quantize_f32_q5_1_block(const float * __restrict__ x, block_q5_1 * __restrict__ y) {
+    float min = x[0];
+    float max = x[0];
+
+    for (int j = 1; j < QK5_1; ++j) {
+        const float v = x[j];
+        min = v < min ? v : min;
+        max = v > max ? v : max;
+    }
+
+    const float d  = (max - min) / 31;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    y->dm.x = d;
+    y->dm.y = min;
+
+    uint32_t qh = 0;
+    for (int j = 0; j < QK5_1/2; ++j) {
+        const float x0 = (x[0 + j] - min)*id;
+        const float x1 = (x[QK5_1/2 + j] - min)*id;
+
+        const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
+        const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
+
+        y->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
+        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2);
+    }
+    memcpy(y->qh, &qh, sizeof(qh));
+}
+
+static __device__ void quantize_f32_q8_0_block(const float * __restrict__ x, block_q8_0 * __restrict__ y) {
+    float amax = 0.0f; // absolute max
+
+    for (int j = 0; j < QK8_0; j++) {
+        const float v = x[j];
+        amax = fmaxf(amax, fabsf(v));
+    }
+
+    const float d  = amax / ((1 << 7) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    y->d = d;
+
+    for (int j = 0; j < QK8_0; ++j) {
+        const float x0 = x[j]*id;
+        y->qs[j] = roundf(x0);
+    }
+}
+
+static __device__ void quantize_f32_iq4_nl_block(const float * __restrict__ x, block_iq4_nl * __restrict__ y) {
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK4_NL; ++j) {
+        const float v = x[j];
+        if (amax < fabsf(v)) {
+            amax = fabsf(v);
+            vmax = v;
+        }
+    }
+
+    float d = vmax / kvalues_iq4nl[0];
+    const float id = d ? 1.0f/d : 0.0f;
+
+    float sumqx = 0, sumq2 = 0;
+    for (int j = 0; j < QK4_NL/2; ++j) {
+        const float x0 = x[0 + j]*id;
+        const float x1 = x[QK4_NL/2 + j]*id;
+        const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
+        const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
+        y->qs[j] = xi0 | (xi1 << 4);
+        const float v0 = kvalues_iq4nl[xi0];
+        const float v1 = kvalues_iq4nl[xi1];
+        const float w0 = x[0 + j]*x[0 + j];
+        const float w1 = x[QK4_NL/2 + j]*x[QK4_NL/2 + j];
+        sumqx += w0*v0*x[j] + w1*v1*x[QK4_NL/2 + j];
+        sumq2 += w0*v0*v0 + w1*v1*v1;
+    }
+
+    y->d = sumq2 > 0 ? sumqx/sumq2 : d;
+}
+
+// Wrapper functions for cpy.cu compatibility
+static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
+    quantize_f32_q4_0_block((const float *)cxi, (block_q4_0 *)cdsti);
+}
+
+static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
+    quantize_f32_q4_1_block((const float *)cxi, (block_q4_1 *)cdsti);
+}
+
+static __device__ void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
+    quantize_f32_q5_0_block((const float *)cxi, (block_q5_0 *)cdsti);
+}
+
+static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
+    quantize_f32_q5_1_block((const float *)cxi, (block_q5_1 *)cdsti);
+}
+
+static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
+    quantize_f32_q8_0_block((const float *)cxi, (block_q8_0 *)cdsti);
+}
+
+static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
+    quantize_f32_iq4_nl_block((const float *)cxi, (block_iq4_nl *)cdsti);
+}
+
+template<typename src_t, typename dst_t>
+static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
+    convert_flt((const src_t *)cxi, (dst_t *)cdsti);
+}
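These helpers all follow the same shape: find a per-block scale, quantize each value against it, and pack the quants. For reference, a minimal CPU round trip of the Q8_0 variant implemented by quantize_f32_q8_0_block above (block size per ggml's standard QK8_0 = 32; names are illustrative, not package code):

    #include <cmath>
    #include <cstdint>

    enum { QK8_0_SKETCH = 32 }; // elements per block, matching ggml's QK8_0

    // Sketch: quantize a block to int8 against one shared scale, then dequantize it back.
    static void q8_0_round_trip(const float * x, float * out) {
        float amax = 0.0f;
        for (int j = 0; j < QK8_0_SKETCH; ++j) {
            amax = fmaxf(amax, fabsf(x[j]));          // absolute max of the block
        }
        const float d  = amax / 127.0f;               // scale over (1 << 7) - 1 levels
        const float id = d ? 1.0f/d : 0.0f;
        for (int j = 0; j < QK8_0_SKETCH; ++j) {
            const int8_t q = (int8_t)roundf(x[j]*id); // quantize
            out[j] = q*d;                             // dequantize
        }
    }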