@novastera-oss/llamarn 0.2.7 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186):
  1. package/android/src/main/cpp/include/llama.h +8 -3
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +56 -22
  11. package/cpp/build-info.cpp +2 -2
  12. package/cpp/llama.cpp/CMakeLists.txt +1 -1
  13. package/cpp/llama.cpp/common/arg.cpp +7 -0
  14. package/cpp/llama.cpp/common/common.cpp +3 -0
  15. package/cpp/llama.cpp/common/common.h +1 -0
  16. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  17. package/cpp/llama.cpp/convert_hf_to_gguf.py +118 -20
  18. package/cpp/llama.cpp/ggml/CMakeLists.txt +1 -0
  19. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  20. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  21. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -0
  22. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
  23. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -2
  24. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  25. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  26. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
  27. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  28. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  29. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  30. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  31. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  32. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  33. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  34. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  35. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +83 -102
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +192 -67
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  48. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +54 -29
  49. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  54. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +84 -31
  55. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  62. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +227 -41
  64. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +362 -182
  65. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  66. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +240 -535
  67. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  68. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
  69. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  70. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  71. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
  72. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
  73. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  74. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  75. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  76. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
  77. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +45 -54
  78. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  79. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  80. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  81. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
  82. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  83. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  84. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  89. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  90. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +57 -1
  91. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  92. package/cpp/llama.cpp/ggml/src/ggml.c +69 -13
  93. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  94. package/cpp/llama.cpp/gguf-py/gguf/constants.py +76 -0
  95. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +21 -0
  96. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +64 -0
  97. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  98. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  99. package/cpp/llama.cpp/include/llama.h +8 -3
  100. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  101. package/cpp/llama.cpp/src/llama-arch.cpp +55 -0
  102. package/cpp/llama.cpp/src/llama-arch.h +18 -0
  103. package/cpp/llama.cpp/src/llama-batch.cpp +570 -359
  104. package/cpp/llama.cpp/src/llama-batch.h +98 -70
  105. package/cpp/llama.cpp/src/llama-chat.cpp +11 -6
  106. package/cpp/llama.cpp/src/llama-context.cpp +101 -107
  107. package/cpp/llama.cpp/src/llama-context.h +13 -13
  108. package/cpp/llama.cpp/src/llama-graph.cpp +199 -252
  109. package/cpp/llama.cpp/src/llama-graph.h +44 -32
  110. package/cpp/llama.cpp/src/llama-hparams.cpp +4 -0
  111. package/cpp/llama.cpp/src/llama-hparams.h +8 -0
  112. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +51 -53
  113. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +19 -24
  114. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +110 -104
  115. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +17 -22
  116. package/cpp/llama.cpp/src/llama-kv-cells.h +35 -11
  117. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +66 -67
  118. package/cpp/llama.cpp/src/llama-memory-hybrid.h +16 -21
  119. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +69 -68
  120. package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
  121. package/cpp/llama.cpp/src/llama-memory.h +18 -22
  122. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  123. package/cpp/llama.cpp/src/llama-model.cpp +1006 -472
  124. package/cpp/llama.cpp/src/llama-model.h +22 -0
  125. package/cpp/llama.cpp/src/llama-quant.cpp +87 -5
  126. package/cpp/llama.cpp/src/llama-vocab.cpp +26 -3
  127. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  128. package/cpp/rn-utils.h +3 -0
  129. package/ios/include/common.h +1 -0
  130. package/ios/include/llama.h +8 -3
  131. package/ios/libs/llama.xcframework/Info.plist +19 -19
  132. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  133. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
  134. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  135. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  136. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -3
  137. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  138. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  139. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  140. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
  141. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  142. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  143. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  144. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  145. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  146. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  147. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3744
  148. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  149. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  150. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -3
  151. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  152. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  153. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -3
  154. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  155. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  156. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  157. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -3
  158. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  159. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  160. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  161. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
  162. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  163. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  164. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -3
  165. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  166. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  167. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  168. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
  169. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  170. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  171. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  172. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  173. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  174. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4900
  175. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  176. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  177. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -3
  178. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  179. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  180. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4871
  181. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3773
  182. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  183. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  184. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  185. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  186. package/package.json +1 -1
@@ -6,6 +6,7 @@
6
6
  #include "ggml-impl.h"
7
7
  #include "ggml-cpu.h"
8
8
  #include "ggml-cpu-impl.h"
9
+ #include "simd-mappings.h"
9
10
  #include "traits.h"
10
11
 
11
12
  #include <cmath>
@@ -39,11 +40,11 @@ static inline __m512 __avx512_f32cx8x2_load(ggml_fp16_t *x, ggml_fp16_t *y) {
39
40
  float tmp[16];
40
41
 
41
42
  for (int i = 0; i < 8; i++) {
42
- tmp[i] = GGML_FP16_TO_FP32(x[i]);
43
+ tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
43
44
  }
44
45
 
45
46
  for (int i = 0; i < 8; i++) {
46
- tmp[i + 8] = GGML_FP16_TO_FP32(y[i]);
47
+ tmp[i + 8] = GGML_CPU_FP16_TO_FP32(y[i]);
47
48
  }
48
49
 
49
50
  return _mm512_loadu_ps(tmp);
@@ -54,10 +55,10 @@ static inline __m512 __avx512_repeat_f32cx16_load(__m128i x) {
54
55
  _mm_storeu_si128((__m128i*)tmphalf, x);
55
56
 
56
57
  for (int i = 0; i < 4; i++) {
57
- tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]);
58
- tmp[i + 4] = GGML_FP16_TO_FP32(tmphalf[i]);
59
- tmp[i + 8] = GGML_FP16_TO_FP32(tmphalf[i]);
60
- tmp[i + 12] = GGML_FP16_TO_FP32(tmphalf[i]);
58
+ tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
59
+ tmp[i + 4] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
60
+ tmp[i + 8] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
61
+ tmp[i + 12] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
61
62
  }
62
63
 
63
64
  return _mm512_loadu_ps(tmp);
@@ -67,7 +68,7 @@ static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
67
68
  float tmp[8];
68
69
 
69
70
  for (int i = 0; i < 8; i++) {
70
- tmp[i] = GGML_FP16_TO_FP32(x[i]);
71
+ tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
71
72
  }
72
73
 
73
74
  return _mm256_loadu_ps(tmp);
@@ -76,8 +77,8 @@ static inline __m256 __avx_repeat_f32cx8_load(ggml_fp16_t *x) {
76
77
  float tmp[8];
77
78
 
78
79
  for (int i = 0; i < 4; i++) {
79
- tmp[i] = GGML_FP16_TO_FP32(x[i]);
80
- tmp[i + 4] = GGML_FP16_TO_FP32(x[i]);
80
+ tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
81
+ tmp[i + 4] = GGML_CPU_FP16_TO_FP32(x[i]);
81
82
  }
82
83
 
83
84
  return _mm256_loadu_ps(tmp);
@@ -88,7 +89,7 @@ static inline __m256 __avx_rearranged_f32cx8_load(ggml_fp16_t *x, __m128i arrang
88
89
 
89
90
  _mm_storeu_si128((__m128i*)tmphalf, _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask));
90
91
  for (int i = 0; i < 8; i++) {
91
- tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]);
92
+ tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
92
93
  }
93
94
 
94
95
  return _mm256_loadu_ps(tmp);
@@ -211,7 +212,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
211
212
  id[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; //d ? 1.0f / d : 0.0f;
212
213
 
213
214
  // Store the scale for the individual block
214
- y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
215
+ y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
215
216
 
216
217
  // Store the values in blocks of eight values - Aim is to use these later for block interleaving
217
218
  srcv[row_iter][0] = v0;
@@ -297,7 +298,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
297
298
  const float d = amax / ((1 << 7) - 1);
298
299
  id[row_iter] = d ? 1.0f / d : 0.0f;
299
300
 
300
- y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
301
+ y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
301
302
  }
302
303
 
303
304
  for (int j = 0; j < QK8_0 * 4; j++) {
@@ -647,7 +648,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
647
648
  const __m256 col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);
648
649
 
649
650
  // Load and convert to FP32 scale from block_q8_0
650
- const __m256 row_scale_f32 = _mm256_set1_ps(GGML_FP16_TO_FP32(a_ptr[b].d));
651
+ const __m256 row_scale_f32 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(a_ptr[b].d));
651
652
 
652
653
  // Load the block values in block_q8_0 in batches of 16 bytes and replicate the same across 256 bit vector
653
654
  __m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)a_ptr[b].qs));
@@ -706,7 +707,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
706
707
  const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
707
708
  sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
708
709
  }
709
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
710
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
710
711
  }
711
712
  }
712
713
  }
@@ -972,13 +973,13 @@ void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
972
973
  sumi2 = sumi2 * scales_1[j];
973
974
  sumi += sumi1 + sumi2;
974
975
  }
975
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
976
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
976
977
  }
977
978
  }
978
979
  for (int sb = 0; sb < 8; sb++) {
979
980
  uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
980
981
  for (int j = 0; j < ncols_interleaved; j++) {
981
- sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
982
+ sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
982
983
  }
983
984
  }
984
985
  }
@@ -1755,7 +1756,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
1755
1756
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1756
1757
  (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
1757
1758
  }
1758
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
1759
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1759
1760
  }
1760
1761
  }
1761
1762
  }
@@ -3259,7 +3260,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
3259
3260
  sumi2 = sumi2 * scales_1[j];
3260
3261
  sumi += sumi1 + sumi2;
3261
3262
  }
3262
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
3263
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
3263
3264
  }
3264
3265
  }
3265
3266
  }
@@ -3268,7 +3269,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
3268
3269
  for(int m = 0; m < 4; m++) {
3269
3270
  const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
3270
3271
  for(int j = 0; j < ncols_interleaved; j++) {
3271
- sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
3272
+ sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
3272
3273
  }
3273
3274
  }
3274
3275
  }
@@ -4,6 +4,7 @@
4
4
  #include "traits.h"
5
5
  #include "ggml-cpu-impl.h"
6
6
  #include "ggml-impl.h"
7
+ #include "simd-mappings.h"
7
8
 
8
9
  #ifdef __cplusplus
9
10
 
@@ -12,11 +13,11 @@
12
13
  // convenience functions/macros for use in template calls
13
14
  // note: these won't be required after the 'traits' lookup table is used.
14
15
  static inline ggml_fp16_t f32_to_f16(float x) {
15
- return GGML_FP32_TO_FP16(x);
16
+ return GGML_CPU_FP32_TO_FP16(x);
16
17
  }
17
18
 
18
19
  static inline float f16_to_f32(ggml_fp16_t x) {
19
- return GGML_FP16_TO_FP32(x);
20
+ return GGML_CPU_FP16_TO_FP32(x);
20
21
  }
21
22
 
22
23
  static inline ggml_bf16_t f32_to_bf16(float x) {
@@ -62,11 +62,17 @@ struct ggml_compute_params {
62
62
  #if defined(__s390x__) && defined(__VEC__)
63
63
  #ifndef __VXE__
64
64
  #define __VXE__
65
- #endif
65
+ #endif // __VXE__
66
66
  #ifndef __VXE2__
67
67
  #define __VXE2__
68
- #endif
69
- #endif
68
+ #endif // __VXE2__
69
+ #endif // __s390x__ && __VEC__
70
+
71
+ #if defined(__s390x__) && defined(GGML_NNPA)
72
+ #ifndef __NNPA__
73
+ #define __NNPA__
74
+ #endif // __NNPA__
75
+ #endif // __s390x__ && GGML_NNPA
70
76
 
71
77
  #if defined(__ARM_FEATURE_SVE)
72
78
  #include <sys/prctl.h>
@@ -72,15 +72,13 @@
72
72
  #define UNUSED GGML_UNUSED
73
73
  #define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0)
74
74
 
75
+ // precomputed f32 table for f16 (256 KB) (simd-mappings.h)
76
+ float ggml_table_f32_f16[1 << 16];
77
+
75
78
  #if defined(__ARM_ARCH)
76
79
  struct ggml_arm_arch_features_type {
77
- int has_neon;
78
- int has_dotprod;
79
- int has_i8mm;
80
- int has_sve;
81
80
  int sve_cnt;
82
- int has_sme;
83
- } ggml_arm_arch_features = {-1, -1, -1, -1, 0, -1};
81
+ } ggml_arm_arch_features = { 0 };
84
82
  #endif
85
83
 
86
84
 
@@ -197,6 +195,7 @@ typedef pthread_t ggml_thread_t;
197
195
 
198
196
  static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
199
197
  [GGML_TYPE_F32] = {
198
+ .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp32,
200
199
  .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
201
200
  .vec_dot_type = GGML_TYPE_F32,
202
201
  .nrows = 1,
@@ -678,87 +677,15 @@ bool ggml_is_numa(void) {
678
677
 
679
678
  #if defined(__linux__) && defined(__aarch64__)
680
679
  #include <sys/auxv.h>
681
- #elif defined(__APPLE__)
682
- #include <sys/sysctl.h>
683
- #endif
684
-
685
- #if !defined(HWCAP2_I8MM)
686
- #define HWCAP2_I8MM (1 << 13)
687
- #endif
688
-
689
- #if !defined(HWCAP2_SME)
690
- #define HWCAP2_SME (1 << 23)
691
680
  #endif
692
681
 
693
682
  static void ggml_init_arm_arch_features(void) {
694
- #if defined(__linux__) && defined(__aarch64__)
695
- uint32_t hwcap = getauxval(AT_HWCAP);
696
- uint32_t hwcap2 = getauxval(AT_HWCAP2);
697
-
698
- ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
699
- ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
700
- ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
701
- ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
702
- ggml_arm_arch_features.has_sme = !!(hwcap2 & HWCAP2_SME);
703
-
704
- #if defined(__ARM_FEATURE_SVE)
683
+ #if defined(__linux__) && defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
705
684
  ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
706
685
  #endif
707
- #elif defined(__APPLE__)
708
- int oldp = 0;
709
- size_t size = sizeof(oldp);
710
- if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
711
- oldp = 0;
712
- }
713
- ggml_arm_arch_features.has_neon = oldp;
714
-
715
- if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) {
716
- oldp = 0;
717
- }
718
- ggml_arm_arch_features.has_dotprod = oldp;
719
-
720
- if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
721
- oldp = 0;
722
- }
723
- ggml_arm_arch_features.has_i8mm = oldp;
724
-
725
- if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) != 0) {
726
- oldp = 0;
727
- }
728
- ggml_arm_arch_features.has_sme = oldp;
729
-
730
- ggml_arm_arch_features.has_sve = 0;
731
- ggml_arm_arch_features.sve_cnt = 0;
732
- #else
733
- // Run-time CPU feature detection not implemented for this platform, fallback to compile time
734
- #if defined(__ARM_NEON)
735
- ggml_arm_arch_features.has_neon = 1;
736
- #else
737
- ggml_arm_arch_features.has_neon = 0;
738
- #endif
739
-
740
- #if defined(__ARM_FEATURE_MATMUL_INT8)
741
- ggml_arm_arch_features.has_i8mm = 1;
742
- #else
743
- ggml_arm_arch_features.has_i8mm = 0;
744
- #endif
745
-
746
- #if defined(__ARM_FEATURE_SVE)
747
- ggml_arm_arch_features.has_sve = 1;
748
- ggml_arm_arch_features.sve_cnt = 16;
749
- #else
750
- ggml_arm_arch_features.has_sve = 0;
751
- ggml_arm_arch_features.sve_cnt = 0;
752
- #endif
753
-
754
- #if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_SME2)
755
- ggml_arm_arch_features.has_sme = 1;
756
- #else
757
- ggml_arm_arch_features.has_sme = 0;
758
- #endif
759
- #endif
760
686
  }
761
- #endif
687
+
688
+ #endif // __ARM_ARCH
762
689
 
763
690
  struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
764
691
  GGML_ASSERT(!ggml_get_no_alloc(ctx));
@@ -813,7 +740,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
813
740
  {
814
741
  assert(tensor->nb[0] == sizeof(ggml_fp16_t));
815
742
  for (int i = 0; i < n; i++) {
816
- ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
743
+ ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value));
817
744
  }
818
745
  } break;
819
746
  case GGML_TYPE_BF16:
@@ -872,7 +799,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
872
799
  {
873
800
  assert(tensor->nb[0] == sizeof(ggml_fp16_t));
874
801
  for (int i = 0; i < n; i++) {
875
- ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
802
+ ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value));
876
803
  }
877
804
  } break;
878
805
  case GGML_TYPE_BF16:
@@ -923,7 +850,7 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
923
850
  case GGML_TYPE_F16:
924
851
  {
925
852
  GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
926
- return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
853
+ return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
927
854
  }
928
855
  case GGML_TYPE_BF16:
929
856
  {
@@ -968,7 +895,7 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
968
895
  case GGML_TYPE_F16:
969
896
  {
970
897
  GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
971
- ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
898
+ ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value);
972
899
  } break;
973
900
  case GGML_TYPE_BF16:
974
901
  {
@@ -997,7 +924,7 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i
997
924
  case GGML_TYPE_I32:
998
925
  return ((int32_t *) data)[0];
999
926
  case GGML_TYPE_F16:
1000
- return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
927
+ return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
1001
928
  case GGML_TYPE_BF16:
1002
929
  return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
1003
930
  case GGML_TYPE_F32:
@@ -1024,7 +951,7 @@ void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
1024
951
  } break;
1025
952
  case GGML_TYPE_F16:
1026
953
  {
1027
- ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
954
+ ((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value);
1028
955
  } break;
1029
956
  case GGML_TYPE_BF16:
1030
957
  {
@@ -1062,7 +989,7 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
1062
989
  }
1063
990
  case GGML_TYPE_F16:
1064
991
  {
1065
- return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
992
+ return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
1066
993
  }
1067
994
  case GGML_TYPE_BF16:
1068
995
  {
@@ -1101,7 +1028,7 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
1101
1028
  } break;
1102
1029
  case GGML_TYPE_F16:
1103
1030
  {
1104
- ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
1031
+ ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value);
1105
1032
  } break;
1106
1033
  case GGML_TYPE_BF16:
1107
1034
  {
@@ -1128,7 +1055,7 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
1128
1055
  case GGML_TYPE_I32:
1129
1056
  return ((int32_t *) data)[0];
1130
1057
  case GGML_TYPE_F16:
1131
- return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
1058
+ return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
1132
1059
  case GGML_TYPE_BF16:
1133
1060
  return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
1134
1061
  case GGML_TYPE_F32:
@@ -1155,7 +1082,7 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
1155
1082
  } break;
1156
1083
  case GGML_TYPE_F16:
1157
1084
  {
1158
- ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
1085
+ ((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value);
1159
1086
  } break;
1160
1087
  case GGML_TYPE_BF16:
1161
1088
  {
@@ -1891,6 +1818,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
1891
1818
  {
1892
1819
  ggml_compute_forward_get_rows_back(params, tensor);
1893
1820
  } break;
1821
+ case GGML_OP_SET_ROWS:
1822
+ {
1823
+ ggml_compute_forward_set_rows(params, tensor);
1824
+ } break;
1894
1825
  case GGML_OP_DIAG:
1895
1826
  {
1896
1827
  ggml_compute_forward_diag(params, tensor);
@@ -1967,6 +1898,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
1967
1898
  {
1968
1899
  ggml_compute_forward_pad_reflect_1d(params, tensor);
1969
1900
  } break;
1901
+ case GGML_OP_ROLL:
1902
+ {
1903
+ ggml_compute_forward_roll(params, tensor);
1904
+ } break;
1970
1905
  case GGML_OP_ARANGE:
1971
1906
  {
1972
1907
  ggml_compute_forward_arange(params, tensor);
@@ -2240,6 +2175,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
2240
2175
  n_tasks = n_threads;
2241
2176
  } break;
2242
2177
  case GGML_OP_GET_ROWS:
2178
+ case GGML_OP_SET_ROWS:
2243
2179
  {
2244
2180
  // FIXME: get_rows can use additional threads, but the cost of launching additional threads
2245
2181
  // decreases performance with GPU offloading
@@ -2291,6 +2227,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
2291
2227
  case GGML_OP_UPSCALE:
2292
2228
  case GGML_OP_PAD:
2293
2229
  case GGML_OP_PAD_REFLECT_1D:
2230
+ case GGML_OP_ROLL:
2294
2231
  case GGML_OP_ARANGE:
2295
2232
  case GGML_OP_TIMESTEP_EMBEDDING:
2296
2233
  case GGML_OP_ARGSORT:
@@ -3193,6 +3130,10 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
3193
3130
  return ggml_graph_compute(cgraph, &cplan);
3194
3131
  }
3195
3132
 
3133
+ void ggml_cpu_fp32_to_fp32(const float * x, float * y, int64_t n) {
3134
+ memcpy(y, x, n * sizeof(float));
3135
+ }
3136
+
3196
3137
  void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
3197
3138
  int64_t i = 0;
3198
3139
  #if defined(__F16C__)
@@ -3213,9 +3154,24 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
3213
3154
  __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
3214
3155
  _mm_storel_epi64((__m128i *)(y + i), y_vec);
3215
3156
  }
3157
+ #elif defined(__NNPA__)
3158
+ for (; i + 7 < n; i += 8) {
3159
+ float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
3160
+ float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
3161
+ uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
3162
+ uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
3163
+ vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
3164
+ }
3165
+ for (; i + 3 < n; i += 4) {
3166
+ float32x4_t v_x = vec_xl(0, (const float *)(x + i));
3167
+ float32x4_t v_zero = vec_splats(0.0f);
3168
+ uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
3169
+ uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
3170
+ vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
3171
+ }
3216
3172
  #endif
3217
3173
  for (; i < n; ++i) {
3218
- y[i] = GGML_FP32_TO_FP16(x[i]);
3174
+ y[i] = GGML_CPU_FP32_TO_FP16(x[i]);
3219
3175
  }
3220
3176
  }
3221
3177
 
@@ -3239,9 +3195,25 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
3239
3195
  __m128 y_vec = _mm_cvtph_ps(x_vec);
3240
3196
  _mm_storeu_ps(y + i, y_vec);
3241
3197
  }
3198
+ #elif defined(__NNPA__)
3199
+ for (; i + 7 < n; i += 8) {
3200
+ uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
3201
+ uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
3202
+ float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
3203
+ float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
3204
+ vec_xst(v_yh, 0, (float *)(y + i + 0));
3205
+ vec_xst(v_yl, 0, (float *)(y + i + 4));
3206
+ }
3207
+ for (; i + 3 < n; i += 4) {
3208
+ uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
3209
+ uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
3210
+ float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
3211
+ vec_xst(v_yh, 0, (float *)(y + i));
3212
+ }
3242
3213
  #endif
3214
+
3243
3215
  for (; i < n; ++i) {
3244
- y[i] = GGML_FP16_TO_FP32(x[i]);
3216
+ y[i] = GGML_CPU_FP16_TO_FP32(x[i]);
3245
3217
  }
3246
3218
  }
3247
3219
 
@@ -3441,9 +3413,17 @@ int ggml_cpu_has_vxe(void) {
3441
3413
  #endif
3442
3414
  }
3443
3415
 
3416
+ int ggml_cpu_has_nnpa(void) {
3417
+ #if defined(GGML_NNPA)
3418
+ return 1;
3419
+ #else
3420
+ return 0;
3421
+ #endif
3422
+ }
3423
+
3444
3424
  int ggml_cpu_has_neon(void) {
3445
3425
  #if defined(__ARM_ARCH) && defined(__ARM_NEON)
3446
- return ggml_arm_arch_features.has_neon;
3426
+ return 1;
3447
3427
  #else
3448
3428
  return 0;
3449
3429
  #endif
@@ -3451,7 +3431,7 @@ int ggml_cpu_has_neon(void) {
3451
3431
 
3452
3432
  int ggml_cpu_has_dotprod(void) {
3453
3433
  #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
3454
- return ggml_arm_arch_features.has_dotprod;
3434
+ return 1;
3455
3435
  #else
3456
3436
  return 0;
3457
3437
  #endif
@@ -3459,7 +3439,7 @@ int ggml_cpu_has_dotprod(void) {
3459
3439
 
3460
3440
  int ggml_cpu_has_sve(void) {
3461
3441
  #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
3462
- return ggml_arm_arch_features.has_sve;
3442
+ return 1;
3463
3443
  #else
3464
3444
  return 0;
3465
3445
  #endif
@@ -3467,7 +3447,7 @@ int ggml_cpu_has_sve(void) {
3467
3447
 
3468
3448
  int ggml_cpu_has_matmul_int8(void) {
3469
3449
  #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8)
3470
- return ggml_arm_arch_features.has_i8mm;
3450
+ return 1;
3471
3451
  #else
3472
3452
  return 0;
3473
3453
  #endif
@@ -3483,14 +3463,14 @@ int ggml_cpu_get_sve_cnt(void) {
3483
3463
 
3484
3464
  int ggml_cpu_has_sme(void) {
3485
3465
  #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SME)
3486
- return ggml_arm_arch_features.has_sme;
3466
+ return 1;
3487
3467
  #else
3488
3468
  return 0;
3489
3469
  #endif
3490
3470
  }
3491
3471
 
3492
3472
  void ggml_cpu_init(void) {
3493
- // needed to initialize f16 tables
3473
+ // needed to initialize ggml_time
3494
3474
  {
3495
3475
  struct ggml_init_params params = { 0, NULL, false };
3496
3476
  struct ggml_context * ctx = ggml_init(params);
@@ -3511,9 +3491,10 @@ void ggml_cpu_init(void) {
3511
3491
  uint16_t u16;
3512
3492
  ggml_fp16_t fp16;
3513
3493
  } u = {i};
3514
- float f = GGML_FP16_TO_FP32(u.fp16);
3515
- ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
3516
- ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
3494
+ float f = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
3495
+ ggml_table_f32_f16[i] = f;
3496
+ ggml_table_gelu_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_f32(f));
3497
+ ggml_table_gelu_quick_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_quick_f32(f));
3517
3498
  }
3518
3499
 
3519
3500
  const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
@@ -416,6 +416,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
416
416
 
417
417
  switch (op->op) {
418
418
  case GGML_OP_CPY:
419
+ case GGML_OP_SET_ROWS:
419
420
  return
420
421
  op->type != GGML_TYPE_IQ3_XXS &&
421
422
  op->type != GGML_TYPE_IQ3_S &&
@@ -578,6 +579,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
578
579
  if (ggml_cpu_has_vxe()) {
579
580
  features.push_back({ "VXE", "1" });
580
581
  }
582
+ if (ggml_cpu_has_nnpa()) {
583
+ features.push_back({ "NNPA", "1" });
584
+ }
581
585
  if (ggml_cpu_has_wasm_simd()) {
582
586
  features.push_back({ "WASM_SIMD", "1" });
583
587
  }
@@ -52,6 +52,7 @@
52
52
  #include "ggml-impl.h"
53
53
  #include "ggml-cpu-impl.h"
54
54
  #include "ggml-quants.h"
55
+ #include "simd-mappings.h"
55
56
 
56
57
  #include <array>
57
58
  #include <type_traits>
@@ -73,7 +74,7 @@
73
74
  namespace {
74
75
 
75
76
  inline float unhalf(ggml_fp16_t d) {
76
- return GGML_FP16_TO_FP32(d);
77
+ return GGML_CPU_FP16_TO_FP32(d);
77
78
  }
78
79
 
79
80
  ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -252,7 +253,7 @@ template <> inline float32x4_t load(const ggml_fp16_t * p) {
252
253
  float tmp[4];
253
254
 
254
255
  for (int i = 0; i < 4; i++) {
255
- tmp[i] = GGML_FP16_TO_FP32(p[i]);
256
+ tmp[i] = GGML_CPU_FP16_TO_FP32(p[i]);
256
257
  }
257
258
 
258
259
  return vec_xl(0, (const float *)(tmp));