@novastera-oss/llamarn 0.2.7 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/android/src/main/cpp/include/llama.h +8 -3
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +56 -22
  11. package/cpp/build-info.cpp +2 -2
  12. package/cpp/llama.cpp/CMakeLists.txt +1 -1
  13. package/cpp/llama.cpp/common/arg.cpp +7 -0
  14. package/cpp/llama.cpp/common/common.cpp +3 -0
  15. package/cpp/llama.cpp/common/common.h +1 -0
  16. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  17. package/cpp/llama.cpp/convert_hf_to_gguf.py +118 -20
  18. package/cpp/llama.cpp/ggml/CMakeLists.txt +1 -0
  19. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  20. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  21. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -0
  22. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
  23. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -2
  24. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  25. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  26. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
  27. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  28. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  29. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  30. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  31. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  32. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  33. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  34. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  35. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +83 -102
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +192 -67
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  48. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +54 -29
  49. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  54. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +84 -31
  55. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  62. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +227 -41
  64. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +362 -182
  65. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  66. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +240 -535
  67. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  68. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
  69. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  70. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  71. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
  72. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
  73. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  74. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  75. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  76. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
  77. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +45 -54
  78. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  79. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  80. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  81. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
  82. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  83. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  84. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  89. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  90. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +57 -1
  91. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  92. package/cpp/llama.cpp/ggml/src/ggml.c +69 -13
  93. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  94. package/cpp/llama.cpp/gguf-py/gguf/constants.py +76 -0
  95. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +21 -0
  96. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +64 -0
  97. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  98. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  99. package/cpp/llama.cpp/include/llama.h +8 -3
  100. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  101. package/cpp/llama.cpp/src/llama-arch.cpp +55 -0
  102. package/cpp/llama.cpp/src/llama-arch.h +18 -0
  103. package/cpp/llama.cpp/src/llama-batch.cpp +570 -359
  104. package/cpp/llama.cpp/src/llama-batch.h +98 -70
  105. package/cpp/llama.cpp/src/llama-chat.cpp +11 -6
  106. package/cpp/llama.cpp/src/llama-context.cpp +101 -107
  107. package/cpp/llama.cpp/src/llama-context.h +13 -13
  108. package/cpp/llama.cpp/src/llama-graph.cpp +199 -252
  109. package/cpp/llama.cpp/src/llama-graph.h +44 -32
  110. package/cpp/llama.cpp/src/llama-hparams.cpp +4 -0
  111. package/cpp/llama.cpp/src/llama-hparams.h +8 -0
  112. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +51 -53
  113. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +19 -24
  114. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +110 -104
  115. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +17 -22
  116. package/cpp/llama.cpp/src/llama-kv-cells.h +35 -11
  117. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +66 -67
  118. package/cpp/llama.cpp/src/llama-memory-hybrid.h +16 -21
  119. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +69 -68
  120. package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
  121. package/cpp/llama.cpp/src/llama-memory.h +18 -22
  122. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  123. package/cpp/llama.cpp/src/llama-model.cpp +1006 -472
  124. package/cpp/llama.cpp/src/llama-model.h +22 -0
  125. package/cpp/llama.cpp/src/llama-quant.cpp +87 -5
  126. package/cpp/llama.cpp/src/llama-vocab.cpp +26 -3
  127. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  128. package/cpp/rn-utils.h +3 -0
  129. package/ios/include/common.h +1 -0
  130. package/ios/include/llama.h +8 -3
  131. package/ios/libs/llama.xcframework/Info.plist +19 -19
  132. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  133. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
  134. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  135. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  136. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -3
  137. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  138. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  139. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  140. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
  141. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  142. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  143. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  144. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  145. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  146. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  147. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3744
  148. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  149. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  150. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -3
  151. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  152. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  153. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -3
  154. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  155. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  156. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  157. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -3
  158. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  159. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  160. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  161. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
  162. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  163. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  164. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -3
  165. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  166. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  167. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  168. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
  169. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  170. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  171. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  172. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  173. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  174. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4900
  175. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  176. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  177. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -3
  178. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  179. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  180. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4871
  181. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3773
  182. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  183. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  184. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  185. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  186. package/package.json +1 -1
@@ -2,6 +2,7 @@
2
2
  #include "ggml-common.h"
3
3
 
4
4
  #include "ggml-cpu-impl.h"
5
+ #include "simd-mappings.h"
5
6
  #include "ggml-quants.h"
6
7
  #include "quants.h"
7
8
 
@@ -137,7 +138,7 @@ void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
137
138
  }
138
139
 
139
140
  int sumi = sumi0 + sumi1;
140
- sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
141
+ sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
141
142
  }
142
143
 
143
144
  *s = sumf;
@@ -174,7 +175,7 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
174
175
  }
175
176
 
176
177
  int sumi = sumi0 + sumi1;
177
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
178
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
178
179
  }
179
180
 
180
181
  *s = sumf;
@@ -217,7 +218,7 @@ void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
217
218
  }
218
219
 
219
220
  int sumi = sumi0 + sumi1;
220
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
221
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
221
222
  }
222
223
 
223
224
  *s = sumf;
@@ -260,7 +261,7 @@ void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
260
261
  }
261
262
 
262
263
  int sumi = sumi0 + sumi1;
263
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
264
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
264
265
  }
265
266
 
266
267
  *s = sumf;
@@ -290,7 +291,7 @@ void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
290
291
  sumi += x[ib].qs[j]*y[ib].qs[j];
291
292
  }
292
293
 
293
- sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
294
+ sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
294
295
  }
295
296
 
296
297
  *s = sumf;
@@ -342,7 +343,7 @@ void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
342
343
  }
343
344
  }
344
345
 
345
- sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d);
346
+ sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
346
347
  }
347
348
 
348
349
  *s = sumf;
@@ -372,7 +373,7 @@ void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
372
373
  }
373
374
  }
374
375
 
375
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
376
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
376
377
 
377
378
  sumf += (float) sumi * d;
378
379
  }
@@ -405,8 +406,8 @@ void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
405
406
  summs += y[i].bsums[j] * (sc[j] >> 4);
406
407
  }
407
408
 
408
- const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
409
- const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
409
+ const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
410
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
410
411
 
411
412
  int isum = 0;
412
413
  int is = 0;
@@ -504,7 +505,7 @@ void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
504
505
  for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
505
506
  q8 += 8; a += 8;
506
507
  }
507
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
508
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
508
509
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
509
510
  }
510
511
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -577,9 +578,9 @@ void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
577
578
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
578
579
  q8 += 8; a += 8;
579
580
  }
580
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
581
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
581
582
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
582
- const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
583
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
583
584
  sumf -= dmin * sumi;
584
585
  }
585
586
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -657,9 +658,9 @@ void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
657
658
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
658
659
  q8 += 8; a += 8;
659
660
  }
660
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
661
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
661
662
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
662
- const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
663
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
663
664
  sumf -= dmin * sumi;
664
665
  }
665
666
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -714,7 +715,7 @@ void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
714
715
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
715
716
  q8 += 8; a += 8;
716
717
  }
717
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
718
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
718
719
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
719
720
  }
720
721
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -739,7 +740,7 @@ void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs
739
740
 
740
741
  float sumf = 0.f;
741
742
  for (int i = 0; i < nb; ++i) {
742
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
743
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
743
744
  const uint16_t * GGML_RESTRICT q2 = x[i].qs;
744
745
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
745
746
  int32_t bsum = 0;
@@ -778,7 +779,7 @@ void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
778
779
 
779
780
  float sumf = 0.f;
780
781
  for (int i = 0; i < nb; ++i) {
781
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
782
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
782
783
  const uint16_t * GGML_RESTRICT q2 = x[i].qs;
783
784
  const uint8_t * GGML_RESTRICT sc = x[i].scales;
784
785
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -829,7 +830,7 @@ void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
829
830
  float sumf = 0;
830
831
  for (int i = 0; i < nb; i++) {
831
832
 
832
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
833
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
833
834
  const int8_t * q8 = y[i].qs;
834
835
  const uint8_t * qs = x[i].qs;
835
836
  const uint8_t * qh = x[i].qh;
@@ -882,7 +883,7 @@ void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs
882
883
 
883
884
  float sumf = 0.f;
884
885
  for (int i = 0; i < nb; ++i) {
885
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
886
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
886
887
  const uint8_t * GGML_RESTRICT q3 = x[i].qs;
887
888
  const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
888
889
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -924,7 +925,7 @@ void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
924
925
 
925
926
  float sumf = 0.f;
926
927
  for (int i = 0; i < nb; ++i) {
927
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
928
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
928
929
  const uint8_t * GGML_RESTRICT qs = x[i].qs;
929
930
  const uint8_t * GGML_RESTRICT qh = x[i].qh;
930
931
  const uint8_t * GGML_RESTRICT signs = x[i].signs;
@@ -1002,7 +1003,7 @@ void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
1002
1003
  qs += 4;
1003
1004
  }
1004
1005
 
1005
- sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
1006
+ sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
1006
1007
  }
1007
1008
 
1008
1009
  *s = sumf;
@@ -1063,7 +1064,7 @@ void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
1063
1064
  qh += 2;
1064
1065
  }
1065
1066
 
1066
- sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
1067
+ sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
1067
1068
  }
1068
1069
 
1069
1070
  *s = sumf;
@@ -1087,7 +1088,7 @@ void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
1087
1088
  float sumf = 0;
1088
1089
 
1089
1090
  for (; ib < nb; ++ib) {
1090
- const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
1091
+ const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
1091
1092
  int sumi1 = 0, sumi2 = 0;
1092
1093
  for (int j = 0; j < QK4_NL/2; ++j) {
1093
1094
  sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@@ -1113,7 +1114,7 @@ void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
1113
1114
 
1114
1115
  float sumf = 0;
1115
1116
  for (int ibl = 0; ibl < nb; ++ibl) {
1116
- const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
1117
+ const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
1117
1118
  uint16_t h = x[ibl].scales_h;
1118
1119
  const uint8_t * qs = x[ibl].qs;
1119
1120
  const int8_t * q8 = y[ibl].qs;
@@ -6,6 +6,7 @@
6
6
  #include "ggml-impl.h"
7
7
  #include "ggml-cpu.h"
8
8
  #include "ggml-cpu-impl.h"
9
+ #include "simd-mappings.h"
9
10
  #include "traits.h"
10
11
 
11
12
  #include "arch-fallback.h"
@@ -72,7 +73,7 @@ void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GG
72
73
  const float d = amax / ((1 << 7) - 1);
73
74
  id[row_iter] = d ? 1.0f / d : 0.0f;
74
75
 
75
- y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
76
+ y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
76
77
  }
77
78
 
78
79
  for (int j = 0; j < QK8_0 * 4; j++) {
@@ -110,7 +111,7 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG
110
111
  const float d = amax / ((1 << 7) - 1);
111
112
  id[row_iter] = d ? 1.0f / d : 0.0f;
112
113
 
113
- y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
114
+ y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
114
115
  }
115
116
 
116
117
  for (int j = 0; j < QK8_0 * 4; j++) {
@@ -236,7 +237,7 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
236
237
  const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
237
238
  sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
238
239
  }
239
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
240
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
240
241
  }
241
242
  }
242
243
  }
@@ -280,7 +281,7 @@ void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
280
281
  const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
281
282
  sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
282
283
  }
283
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
284
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
284
285
  }
285
286
  }
286
287
  }
@@ -325,7 +326,7 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
325
326
  const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
326
327
  sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
327
328
  }
328
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
329
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
329
330
  }
330
331
  }
331
332
  }
@@ -396,13 +397,13 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
396
397
  sumi2 = sumi2 * scales_1[j];
397
398
  sumi += sumi1 + sumi2;
398
399
  }
399
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
400
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
400
401
  }
401
402
  }
402
403
  for (int sb = 0; sb < 8; sb++) {
403
404
  uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
404
405
  for (int j = 0; j < ncols_interleaved; j++) {
405
- sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
406
+ sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
406
407
  }
407
408
  }
408
409
  }
@@ -449,7 +450,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
449
450
  const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
450
451
  sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
451
452
  }
452
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
453
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
453
454
  }
454
455
  }
455
456
  }
@@ -500,7 +501,7 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
500
501
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
501
502
  (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
502
503
  }
503
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
504
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
504
505
  }
505
506
  }
506
507
  }
@@ -555,7 +556,7 @@ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
555
556
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
556
557
  (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
557
558
  }
558
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
559
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
559
560
  }
560
561
  }
561
562
  }
@@ -609,7 +610,7 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
609
610
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
610
611
  (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
611
612
  }
612
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
613
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
613
614
  }
614
615
  }
615
616
  }
@@ -688,7 +689,7 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
688
689
  sumi2 = sumi2 * scales_1[j];
689
690
  sumi += sumi1 + sumi2;
690
691
  }
691
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
692
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
692
693
  }
693
694
  }
694
695
  }
@@ -697,7 +698,7 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
697
698
  for(int m = 0; m < 4; m++) {
698
699
  const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
699
700
  for(int j = 0; j < ncols_interleaved; j++) {
700
- sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
701
+ sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
701
702
  }
702
703
  }
703
704
  }
@@ -753,7 +754,7 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
753
754
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
754
755
  (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
755
756
  }
756
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
757
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
757
758
  }
758
759
  }
759
760
  }
@@ -1163,13 +1164,24 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
1163
1164
  // not realy a GGML_TYPE_Q8_0 but same size.
1164
1165
  switch (op->op) {
1165
1166
  case GGML_OP_MUL_MAT:
1166
- size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
1167
- return true;
1167
+ {
1168
+ size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
1169
+ return true;
1170
+ }
1168
1171
  case GGML_OP_MUL_MAT_ID:
1169
- size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
1170
- size = GGML_PAD(size, sizeof(int64_t)); // + padding for next bloc.
1171
- size += sizeof(int64_t) * (1+op->src[0]->ne[2]) * op->src[1]->ne[2];
1172
- return true;
1172
+ {
1173
+ size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
1174
+ size = GGML_PAD(size, sizeof(int64_t)); // + padding for next bloc.
1175
+
1176
+ const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert
1177
+ const int64_t ne12 = op->src[1]->ne[2]; // n_tokens
1178
+
1179
+ const size_t sizeof_mmid_row_mapping = sizeof(int64_t);
1180
+
1181
+ size += sizeof_mmid_row_mapping*ne02*(ne12 + 1);
1182
+
1183
+ return true;
1184
+ }
1173
1185
  default:
1174
1186
  // GGML_ABORT("fatal error");
1175
1187
  break;
@@ -1305,14 +1317,17 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
1305
1317
  int32_t i2;
1306
1318
  };
1307
1319
 
1308
- GGML_ASSERT(params->wsize >= (GGML_PAD(nbw3, sizeof(int64_t)) + n_as * sizeof(int64_t) +
1309
- n_as * ne12 * sizeof(mmid_row_mapping)));
1320
+ GGML_ASSERT(params->wsize >=
1321
+ (GGML_PAD(nbw3, sizeof(int64_t)) +
1322
+ n_as*(ne12 + 1)*sizeof(mmid_row_mapping))
1323
+ );
1310
1324
 
1311
- auto * wdata = (char *) params->wdata;
1312
- auto * wdata_src1_end = (char *) wdata + GGML_PAD(nbw3, sizeof(int64_t));
1313
- auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
1325
+ auto * wdata = (char *)params->wdata;
1326
+ auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t));
1314
1327
 
1315
- struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
1328
+ // total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t)
1329
+ auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
1330
+ struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
1316
1331
 
1317
1332
  // src1: float32 => param type
1318
1333
  for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -1397,44 +1412,45 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
1397
1412
  }
1398
1413
  };
1399
1414
 
1400
- // instance for Q4
1401
- static const tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
1402
- static const tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
1403
- static const tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
1404
- static const tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
1405
-
1406
- // instance for IQ4
1407
- static const tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
1408
-
1409
1415
  } // namespace ggml::cpu::repack
1410
1416
 
1411
1417
  static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) {
1418
+
1419
+ // instance for Q4
1420
+ static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
1421
+ static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
1422
+ static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
1423
+ static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
1424
+
1425
+ // instance for IQ4
1426
+ static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
1427
+
1412
1428
  if (cur->type == GGML_TYPE_Q4_0) {
1413
1429
  if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
1414
1430
  if (cur->ne[1] % 8 == 0) {
1415
- return &ggml::cpu::repack::q4_0_8x8_q8_0;
1431
+ return &q4_0_8x8_q8_0;
1416
1432
  }
1417
1433
  }
1418
1434
  if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
1419
1435
  if (cur->ne[1] % 4 == 0) {
1420
- return &ggml::cpu::repack::q4_0_4x8_q8_0;
1436
+ return &q4_0_4x8_q8_0;
1421
1437
  }
1422
1438
  }
1423
1439
  if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
1424
1440
  if (cur->ne[1] % 4 == 0) {
1425
- return &ggml::cpu::repack::q4_0_4x4_q8_0;
1441
+ return &q4_0_4x4_q8_0;
1426
1442
  }
1427
1443
  }
1428
1444
  } else if (cur->type == GGML_TYPE_Q4_K) {
1429
1445
  if (ggml_cpu_has_avx2()) {
1430
1446
  if (cur->ne[1] % 8 == 0) {
1431
- return &ggml::cpu::repack::q4_K_8x8_q8_K;
1447
+ return &q4_K_8x8_q8_K;
1432
1448
  }
1433
1449
  }
1434
1450
  } else if (cur->type == GGML_TYPE_IQ4_NL) {
1435
1451
  if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
1436
1452
  if (cur->ne[1] % 4 == 0) {
1437
- return &ggml::cpu::repack::iq4_nl_4x4_q8_0;
1453
+ return &iq4_nl_4x4_q8_0;
1438
1454
  }
1439
1455
  }
1440
1456
  }