whisper.rn 0.5.0-rc.9 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. package/android/build.gradle +2 -1
  2. package/android/gradle.properties +1 -1
  3. package/cpp/ggml-alloc.c +265 -141
  4. package/cpp/ggml-backend-impl.h +4 -1
  5. package/cpp/ggml-backend-reg.cpp +30 -13
  6. package/cpp/ggml-backend.cpp +221 -38
  7. package/cpp/ggml-backend.h +17 -1
  8. package/cpp/ggml-common.h +17 -0
  9. package/cpp/ggml-cpu/amx/amx.cpp +4 -2
  10. package/cpp/ggml-cpu/arch/arm/quants.c +132 -596
  11. package/cpp/ggml-cpu/arch/arm/repack.cpp +14 -286
  12. package/cpp/ggml-cpu/arch/x86/quants.c +184 -675
  13. package/cpp/ggml-cpu/arch/x86/repack.cpp +4679 -1657
  14. package/cpp/ggml-cpu/arch-fallback.h +32 -2
  15. package/cpp/ggml-cpu/common.h +14 -0
  16. package/cpp/ggml-cpu/ggml-cpu-impl.h +13 -6
  17. package/cpp/ggml-cpu/ggml-cpu.c +70 -42
  18. package/cpp/ggml-cpu/ggml-cpu.cpp +35 -28
  19. package/cpp/ggml-cpu/ops.cpp +1587 -1177
  20. package/cpp/ggml-cpu/ops.h +5 -8
  21. package/cpp/ggml-cpu/quants.c +35 -0
  22. package/cpp/ggml-cpu/quants.h +8 -0
  23. package/cpp/ggml-cpu/repack.cpp +458 -47
  24. package/cpp/ggml-cpu/repack.h +22 -0
  25. package/cpp/ggml-cpu/simd-mappings.h +89 -60
  26. package/cpp/ggml-cpu/traits.cpp +2 -2
  27. package/cpp/ggml-cpu/traits.h +1 -1
  28. package/cpp/ggml-cpu/vec.cpp +170 -26
  29. package/cpp/ggml-cpu/vec.h +506 -63
  30. package/cpp/ggml-cpu.h +1 -1
  31. package/cpp/ggml-impl.h +119 -9
  32. package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
  33. package/cpp/ggml-metal/ggml-metal-common.h +52 -0
  34. package/cpp/ggml-metal/ggml-metal-context.h +33 -0
  35. package/cpp/ggml-metal/ggml-metal-context.m +600 -0
  36. package/cpp/ggml-metal/ggml-metal-device.cpp +1376 -0
  37. package/cpp/ggml-metal/ggml-metal-device.h +226 -0
  38. package/cpp/ggml-metal/ggml-metal-device.m +1312 -0
  39. package/cpp/ggml-metal/ggml-metal-impl.h +722 -0
  40. package/cpp/ggml-metal/ggml-metal-ops.cpp +3158 -0
  41. package/cpp/ggml-metal/ggml-metal-ops.h +82 -0
  42. package/cpp/ggml-metal/ggml-metal.cpp +718 -0
  43. package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
  44. package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
  45. package/cpp/ggml-metal-impl.h +90 -51
  46. package/cpp/ggml-metal.h +1 -6
  47. package/cpp/ggml-opt.cpp +97 -41
  48. package/cpp/ggml-opt.h +25 -6
  49. package/cpp/ggml-quants.c +111 -16
  50. package/cpp/ggml-quants.h +6 -0
  51. package/cpp/ggml.c +486 -98
  52. package/cpp/ggml.h +221 -16
  53. package/cpp/gguf.cpp +8 -1
  54. package/cpp/jsi/RNWhisperJSI.cpp +25 -6
  55. package/cpp/jsi/ThreadPool.h +3 -3
  56. package/cpp/whisper.cpp +100 -76
  57. package/cpp/whisper.h +1 -0
  58. package/ios/CMakeLists.txt +6 -1
  59. package/ios/RNWhisper.mm +6 -6
  60. package/ios/RNWhisperContext.mm +2 -0
  61. package/ios/RNWhisperVadContext.mm +16 -13
  62. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  63. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  64. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
  65. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  66. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +119 -9
  67. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
  68. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  69. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  70. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  71. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +221 -16
  72. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  73. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  74. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  75. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  76. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  77. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  78. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
  79. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  80. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +119 -9
  81. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
  82. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  83. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  84. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  85. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +221 -16
  86. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  87. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  88. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  89. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  90. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  91. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  92. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  93. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
  94. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  95. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +119 -9
  96. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
  97. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  98. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  99. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  100. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +221 -16
  101. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  102. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  103. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  104. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  105. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  106. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  107. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
  108. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  109. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +119 -9
  110. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
  111. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  112. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  113. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  114. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +221 -16
  115. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  116. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  117. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  118. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  119. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  120. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +13 -0
  121. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -1
  122. package/lib/commonjs/version.json +1 -1
  123. package/lib/module/realtime-transcription/RealtimeTranscriber.js +13 -0
  124. package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -1
  125. package/lib/module/version.json +1 -1
  126. package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -1
  127. package/lib/typescript/realtime-transcription/types.d.ts +6 -0
  128. package/lib/typescript/realtime-transcription/types.d.ts.map +1 -1
  129. package/package.json +1 -1
  130. package/src/realtime-transcription/RealtimeTranscriber.ts +17 -0
  131. package/src/realtime-transcription/types.ts +6 -0
  132. package/src/version.json +1 -1
  133. package/whisper-rn.podspec +8 -9
  134. package/cpp/ggml-metal.m +0 -6284
  135. package/cpp/ggml-whisper-sim.metallib +0 -0
  136. package/cpp/ggml-whisper.metallib +0 -0
@@ -13,6 +13,7 @@
13
13
  #define wsp_ggml_vec_dot_q5_0_q8_0_generic wsp_ggml_vec_dot_q5_0_q8_0
14
14
  #define wsp_ggml_vec_dot_q5_1_q8_1_generic wsp_ggml_vec_dot_q5_1_q8_1
15
15
  #define wsp_ggml_vec_dot_q8_0_q8_0_generic wsp_ggml_vec_dot_q8_0_q8_0
16
+ #define wsp_ggml_vec_dot_mxfp4_q8_0_generic wsp_ggml_vec_dot_mxfp4_q8_0
16
17
  #define wsp_ggml_vec_dot_tq1_0_q8_K_generic wsp_ggml_vec_dot_tq1_0_q8_K
17
18
  #define wsp_ggml_vec_dot_tq2_0_q8_K_generic wsp_ggml_vec_dot_tq2_0_q8_K
18
19
  #define wsp_ggml_vec_dot_q2_K_q8_K_generic wsp_ggml_vec_dot_q2_K_q8_K
@@ -37,17 +38,25 @@
37
38
  #define wsp_ggml_gemv_q4_0_4x8_q8_0_generic wsp_ggml_gemv_q4_0_4x8_q8_0
38
39
  #define wsp_ggml_gemv_q4_0_8x8_q8_0_generic wsp_ggml_gemv_q4_0_8x8_q8_0
39
40
  #define wsp_ggml_gemv_q4_K_8x8_q8_K_generic wsp_ggml_gemv_q4_K_8x8_q8_K
41
+ #define wsp_ggml_gemv_q2_K_8x8_q8_K_generic wsp_ggml_gemv_q2_K_8x8_q8_K
40
42
  #define wsp_ggml_gemv_iq4_nl_4x4_q8_0_generic wsp_ggml_gemv_iq4_nl_4x4_q8_0
43
+ #define wsp_ggml_gemv_iq4_nl_8x8_q8_0_generic wsp_ggml_gemv_iq4_nl_8x8_q8_0
41
44
  #define wsp_ggml_gemm_q4_0_4x4_q8_0_generic wsp_ggml_gemm_q4_0_4x4_q8_0
42
45
  #define wsp_ggml_gemm_q4_0_4x8_q8_0_generic wsp_ggml_gemm_q4_0_4x8_q8_0
43
46
  #define wsp_ggml_gemm_q4_0_8x8_q8_0_generic wsp_ggml_gemm_q4_0_8x8_q8_0
44
47
  #define wsp_ggml_gemm_q4_K_8x8_q8_K_generic wsp_ggml_gemm_q4_K_8x8_q8_K
48
+ #define wsp_ggml_gemm_q2_K_8x8_q8_K_generic wsp_ggml_gemm_q2_K_8x8_q8_K
45
49
  #define wsp_ggml_gemm_iq4_nl_4x4_q8_0_generic wsp_ggml_gemm_iq4_nl_4x4_q8_0
50
+ #define wsp_ggml_gemm_iq4_nl_8x8_q8_0_generic wsp_ggml_gemm_iq4_nl_8x8_q8_0
46
51
  #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
47
52
  // repack.cpp
48
53
  #define wsp_ggml_wsp_quantize_mat_q8_K_4x8_generic wsp_ggml_wsp_quantize_mat_q8_K_4x8
49
54
  #define wsp_ggml_gemv_q4_K_8x8_q8_K_generic wsp_ggml_gemv_q4_K_8x8_q8_K
55
+ #define wsp_ggml_gemv_iq4_nl_8x8_q8_0_generic wsp_ggml_gemv_iq4_nl_8x8_q8_0
56
+ #define wsp_ggml_gemv_q2_K_8x8_q8_K_generic wsp_ggml_gemv_q2_K_8x8_q8_K
50
57
  #define wsp_ggml_gemm_q4_K_8x8_q8_K_generic wsp_ggml_gemm_q4_K_8x8_q8_K
58
+ #define wsp_ggml_gemm_iq4_nl_8x8_q8_0_generic wsp_ggml_gemm_iq4_nl_8x8_q8_0
59
+ #define wsp_ggml_gemm_q2_K_8x8_q8_K_generic wsp_ggml_gemm_q2_K_8x8_q8_K
51
60
  #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
52
61
  // repack.cpp
53
62
  #define wsp_ggml_wsp_quantize_mat_q8_0_4x4_generic wsp_ggml_wsp_quantize_mat_q8_0_4x4
@@ -72,18 +81,23 @@
72
81
  #define wsp_ggml_gemv_q4_0_4x8_q8_0_generic wsp_ggml_gemv_q4_0_4x8_q8_0
73
82
  #define wsp_ggml_gemv_q4_0_8x8_q8_0_generic wsp_ggml_gemv_q4_0_8x8_q8_0
74
83
  #define wsp_ggml_gemv_q4_K_8x8_q8_K_generic wsp_ggml_gemv_q4_K_8x8_q8_K
84
+ #define wsp_ggml_gemv_q2_K_8x8_q8_K_generic wsp_ggml_gemv_q2_K_8x8_q8_K
75
85
  #define wsp_ggml_gemv_iq4_nl_4x4_q8_0_generic wsp_ggml_gemv_iq4_nl_4x4_q8_0
86
+ #define wsp_ggml_gemv_iq4_nl_8x8_q8_0_generic wsp_ggml_gemv_iq4_nl_8x8_q8_0
76
87
  #define wsp_ggml_gemm_q4_0_4x4_q8_0_generic wsp_ggml_gemm_q4_0_4x4_q8_0
77
88
  #define wsp_ggml_gemm_q4_0_4x8_q8_0_generic wsp_ggml_gemm_q4_0_4x8_q8_0
78
89
  #define wsp_ggml_gemm_q4_0_8x8_q8_0_generic wsp_ggml_gemm_q4_0_8x8_q8_0
79
90
  #define wsp_ggml_gemm_q4_K_8x8_q8_K_generic wsp_ggml_gemm_q4_K_8x8_q8_K
91
+ #define wsp_ggml_gemm_q2_K_8x8_q8_K_generic wsp_ggml_gemm_q2_K_8x8_q8_K
80
92
  #define wsp_ggml_gemm_iq4_nl_4x4_q8_0_generic wsp_ggml_gemm_iq4_nl_4x4_q8_0
93
+ #define wsp_ggml_gemm_iq4_nl_8x8_q8_0_generic wsp_ggml_gemm_iq4_nl_8x8_q8_0
81
94
  #elif defined(__loongarch64)
82
95
  // quants.c
83
96
  #define wsp_quantize_row_q8_K_generic wsp_quantize_row_q8_K
84
97
  #define wsp_ggml_vec_dot_tq1_0_q8_K_generic wsp_ggml_vec_dot_tq1_0_q8_K
85
98
  #define wsp_ggml_vec_dot_tq2_0_q8_K_generic wsp_ggml_vec_dot_tq2_0_q8_K
86
99
  #define wsp_ggml_vec_dot_iq1_m_q8_K_generic wsp_ggml_vec_dot_iq1_m_q8_K
100
+ #define wsp_ggml_vec_dot_mxfp4_q8_0_generic wsp_ggml_vec_dot_mxfp4_q8_0
87
101
  // repack.cpp
88
102
  #define wsp_ggml_wsp_quantize_mat_q8_0_4x4_generic wsp_ggml_wsp_quantize_mat_q8_0_4x4
89
103
  #define wsp_ggml_wsp_quantize_mat_q8_0_4x8_generic wsp_ggml_wsp_quantize_mat_q8_0_4x8
@@ -92,12 +106,16 @@
92
106
  #define wsp_ggml_gemv_q4_0_4x8_q8_0_generic wsp_ggml_gemv_q4_0_4x8_q8_0
93
107
  #define wsp_ggml_gemv_q4_0_8x8_q8_0_generic wsp_ggml_gemv_q4_0_8x8_q8_0
94
108
  #define wsp_ggml_gemv_q4_K_8x8_q8_K_generic wsp_ggml_gemv_q4_K_8x8_q8_K
109
+ #define wsp_ggml_gemv_q2_K_8x8_q8_K_generic wsp_ggml_gemv_q2_K_8x8_q8_K
95
110
  #define wsp_ggml_gemv_iq4_nl_4x4_q8_0_generic wsp_ggml_gemv_iq4_nl_4x4_q8_0
111
+ #define wsp_ggml_gemv_iq4_nl_8x8_q8_0_generic wsp_ggml_gemv_iq4_nl_8x8_q8_0
96
112
  #define wsp_ggml_gemm_q4_0_4x4_q8_0_generic wsp_ggml_gemm_q4_0_4x4_q8_0
97
113
  #define wsp_ggml_gemm_q4_0_4x8_q8_0_generic wsp_ggml_gemm_q4_0_4x8_q8_0
98
114
  #define wsp_ggml_gemm_q4_0_8x8_q8_0_generic wsp_ggml_gemm_q4_0_8x8_q8_0
99
115
  #define wsp_ggml_gemm_q4_K_8x8_q8_K_generic wsp_ggml_gemm_q4_K_8x8_q8_K
116
+ #define wsp_ggml_gemm_q2_K_8x8_q8_K_generic wsp_ggml_gemm_q2_K_8x8_q8_K
100
117
  #define wsp_ggml_gemm_iq4_nl_4x4_q8_0_generic wsp_ggml_gemm_iq4_nl_4x4_q8_0
118
+ #define wsp_ggml_gemm_iq4_nl_8x8_q8_0_generic wsp_ggml_gemm_iq4_nl_8x8_q8_0
101
119
  #elif defined(__riscv)
102
120
  // quants.c
103
121
  #define wsp_quantize_row_q8_K_generic wsp_quantize_row_q8_K
@@ -112,6 +130,7 @@
112
130
  #define wsp_ggml_vec_dot_iq1_m_q8_K_generic wsp_ggml_vec_dot_iq1_m_q8_K
113
131
  #define wsp_ggml_vec_dot_iq4_nl_q8_0_generic wsp_ggml_vec_dot_iq4_nl_q8_0
114
132
  #define wsp_ggml_vec_dot_iq4_xs_q8_K_generic wsp_ggml_vec_dot_iq4_xs_q8_K
133
+ #define wsp_ggml_vec_dot_mxfp4_q8_0_generic wsp_ggml_vec_dot_mxfp4_q8_0
115
134
  // repack.cpp
116
135
  #define wsp_ggml_wsp_quantize_mat_q8_0_4x4_generic wsp_ggml_wsp_quantize_mat_q8_0_4x4
117
136
  #define wsp_ggml_wsp_quantize_mat_q8_0_4x8_generic wsp_ggml_wsp_quantize_mat_q8_0_4x8
@@ -119,16 +138,18 @@
119
138
  #define wsp_ggml_gemv_q4_0_4x4_q8_0_generic wsp_ggml_gemv_q4_0_4x4_q8_0
120
139
  #define wsp_ggml_gemv_q4_0_4x8_q8_0_generic wsp_ggml_gemv_q4_0_4x8_q8_0
121
140
  #define wsp_ggml_gemv_q4_K_8x8_q8_K_generic wsp_ggml_gemv_q4_K_8x8_q8_K
141
+ #define wsp_ggml_gemv_q2_K_8x8_q8_K_generic wsp_ggml_gemv_q2_K_8x8_q8_K
122
142
  #define wsp_ggml_gemv_iq4_nl_4x4_q8_0_generic wsp_ggml_gemv_iq4_nl_4x4_q8_0
143
+ #define wsp_ggml_gemv_iq4_nl_8x8_q8_0_generic wsp_ggml_gemv_iq4_nl_8x8_q8_0
123
144
  #define wsp_ggml_gemm_q4_0_4x4_q8_0_generic wsp_ggml_gemm_q4_0_4x4_q8_0
124
145
  #define wsp_ggml_gemm_q4_0_4x8_q8_0_generic wsp_ggml_gemm_q4_0_4x8_q8_0
125
146
  #define wsp_ggml_gemm_q4_K_8x8_q8_K_generic wsp_ggml_gemm_q4_K_8x8_q8_K
147
+ #define wsp_ggml_gemm_q2_K_8x8_q8_K_generic wsp_ggml_gemm_q2_K_8x8_q8_K
126
148
  #define wsp_ggml_gemm_iq4_nl_4x4_q8_0_generic wsp_ggml_gemm_iq4_nl_4x4_q8_0
149
+ #define wsp_ggml_gemm_iq4_nl_8x8_q8_0_generic wsp_ggml_gemm_iq4_nl_8x8_q8_0
127
150
  #elif defined(__s390x__)
128
151
  // quants.c
129
152
  #define wsp_quantize_row_q8_K_generic wsp_quantize_row_q8_K
130
- #define wsp_ggml_vec_dot_q5_0_q8_0_generic wsp_ggml_vec_dot_q5_0_q8_0
131
- #define wsp_ggml_vec_dot_q5_1_q8_1_generic wsp_ggml_vec_dot_q5_1_q8_1
132
153
  #define wsp_ggml_vec_dot_tq1_0_q8_K_generic wsp_ggml_vec_dot_tq1_0_q8_K
133
154
  #define wsp_ggml_vec_dot_tq2_0_q8_K_generic wsp_ggml_vec_dot_tq2_0_q8_K
134
155
  #define wsp_ggml_vec_dot_q2_K_q8_K_generic wsp_ggml_vec_dot_q2_K_q8_K
@@ -147,12 +168,16 @@
147
168
  #define wsp_ggml_gemv_q4_0_4x8_q8_0_generic wsp_ggml_gemv_q4_0_4x8_q8_0
148
169
  #define wsp_ggml_gemv_q4_0_8x8_q8_0_generic wsp_ggml_gemv_q4_0_8x8_q8_0
149
170
  #define wsp_ggml_gemv_q4_K_8x8_q8_K_generic wsp_ggml_gemv_q4_K_8x8_q8_K
171
+ #define wsp_ggml_gemv_q2_K_8x8_q8_K_generic wsp_ggml_gemv_q2_K_8x8_q8_K
150
172
  #define wsp_ggml_gemv_iq4_nl_4x4_q8_0_generic wsp_ggml_gemv_iq4_nl_4x4_q8_0
173
+ #define wsp_ggml_gemv_iq4_nl_8x8_q8_0_generic wsp_ggml_gemv_iq4_nl_8x8_q8_0
151
174
  #define wsp_ggml_gemm_q4_0_4x4_q8_0_generic wsp_ggml_gemm_q4_0_4x4_q8_0
152
175
  #define wsp_ggml_gemm_q4_0_4x8_q8_0_generic wsp_ggml_gemm_q4_0_4x8_q8_0
153
176
  #define wsp_ggml_gemm_q4_0_8x8_q8_0_generic wsp_ggml_gemm_q4_0_8x8_q8_0
154
177
  #define wsp_ggml_gemm_q4_K_8x8_q8_K_generic wsp_ggml_gemm_q4_K_8x8_q8_K
178
+ #define wsp_ggml_gemm_q2_K_8x8_q8_K_generic wsp_ggml_gemm_q2_K_8x8_q8_K
155
179
  #define wsp_ggml_gemm_iq4_nl_4x4_q8_0_generic wsp_ggml_gemm_iq4_nl_4x4_q8_0
180
+ #define wsp_ggml_gemm_iq4_nl_8x8_q8_0_generic wsp_ggml_gemm_iq4_nl_8x8_q8_0
156
181
  #elif defined(__wasm__)
157
182
  // quants.c
158
183
  #define wsp_ggml_vec_dot_q4_1_q8_1_generic wsp_ggml_vec_dot_q4_1_q8_1
@@ -167,6 +192,7 @@
167
192
  #define wsp_ggml_vec_dot_iq1_m_q8_K_generic wsp_ggml_vec_dot_iq1_m_q8_K
168
193
  #define wsp_ggml_vec_dot_iq4_nl_q8_0_generic wsp_ggml_vec_dot_iq4_nl_q8_0
169
194
  #define wsp_ggml_vec_dot_iq4_xs_q8_K_generic wsp_ggml_vec_dot_iq4_xs_q8_K
195
+ #define wsp_ggml_vec_dot_mxfp4_q8_0_generic wsp_ggml_vec_dot_mxfp4_q8_0
170
196
  // repack.cpp
171
197
  #define wsp_ggml_wsp_quantize_mat_q8_0_4x4_generic wsp_ggml_wsp_quantize_mat_q8_0_4x4
172
198
  #define wsp_ggml_wsp_quantize_mat_q8_0_4x8_generic wsp_ggml_wsp_quantize_mat_q8_0_4x8
@@ -175,10 +201,14 @@
175
201
  #define wsp_ggml_gemv_q4_0_4x8_q8_0_generic wsp_ggml_gemv_q4_0_4x8_q8_0
176
202
  #define wsp_ggml_gemv_q4_0_8x8_q8_0_generic wsp_ggml_gemv_q4_0_8x8_q8_0
177
203
  #define wsp_ggml_gemv_q4_K_8x8_q8_K_generic wsp_ggml_gemv_q4_K_8x8_q8_K
204
+ #define wsp_ggml_gemv_q2_K_8x8_q8_K_generic wsp_ggml_gemv_q2_K_8x8_q8_K
178
205
  #define wsp_ggml_gemv_iq4_nl_4x4_q8_0_generic wsp_ggml_gemv_iq4_nl_4x4_q8_0
206
+ #define wsp_ggml_gemv_iq4_nl_8x8_q8_0_generic wsp_ggml_gemv_iq4_nl_8x8_q8_0
179
207
  #define wsp_ggml_gemm_q4_0_4x4_q8_0_generic wsp_ggml_gemm_q4_0_4x4_q8_0
180
208
  #define wsp_ggml_gemm_q4_0_4x8_q8_0_generic wsp_ggml_gemm_q4_0_4x8_q8_0
181
209
  #define wsp_ggml_gemm_q4_0_8x8_q8_0_generic wsp_ggml_gemm_q4_0_8x8_q8_0
182
210
  #define wsp_ggml_gemm_q4_K_8x8_q8_K_generic wsp_ggml_gemm_q4_K_8x8_q8_K
211
+ #define wsp_ggml_gemm_q2_K_8x8_q8_K_generic wsp_ggml_gemm_q2_K_8x8_q8_K
183
212
  #define wsp_ggml_gemm_iq4_nl_4x4_q8_0_generic wsp_ggml_gemm_iq4_nl_4x4_q8_0
213
+ #define wsp_ggml_gemm_iq4_nl_8x8_q8_0_generic wsp_ggml_gemm_iq4_nl_8x8_q8_0
184
214
  #endif
@@ -28,6 +28,14 @@ static inline float bf16_to_f32(wsp_ggml_bf16_t x) {
28
28
  return WSP_GGML_BF16_TO_FP32(x);
29
29
  }
30
30
 
31
+ static inline float i32_to_f32(int32_t x) {
32
+ return x;
33
+ }
34
+
35
+ static inline int32_t f32_to_i32(float x) {
36
+ return x;
37
+ }
38
+
31
39
  static inline float f32_to_f32(float x) {
32
40
  return x;
33
41
  }
@@ -54,6 +62,12 @@ struct type_conversion_table<wsp_ggml_bf16_t> {
54
62
  static constexpr wsp_ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
55
63
  };
56
64
 
65
+ template <>
66
+ struct type_conversion_table<int32_t> {
67
+ static constexpr float (*to_f32)(int32_t) = i32_to_f32;
68
+ static constexpr int32_t (*from_f32)(float) = f32_to_i32;
69
+ };
70
+
57
71
  static std::pair<int64_t, int64_t> get_thread_range(const struct wsp_ggml_compute_params * params, const struct wsp_ggml_tensor * src0) {
58
72
  const int64_t ith = params->ith;
59
73
  const int64_t nth = params->nth;
@@ -68,12 +68,6 @@ struct wsp_ggml_compute_params {
68
68
  #endif // __VXE2__
69
69
  #endif // __s390x__ && __VEC__
70
70
 
71
- #if defined(__s390x__) && defined(WSP_GGML_NNPA)
72
- #ifndef __NNPA__
73
- #define __NNPA__
74
- #endif // __NNPA__
75
- #endif // __s390x__ && WSP_GGML_NNPA
76
-
77
71
  #if defined(__ARM_FEATURE_SVE)
78
72
  #include <sys/prctl.h>
79
73
  #endif
@@ -486,6 +480,19 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
486
480
  return v_abo + v_abe;
487
481
  }
488
482
 
483
+ /**
484
+ * @see https://github.com/ggml-org/llama.cpp/pull/14037
485
+ */
486
+ inline static float vec_hsum_f32x4(float32x4_t v) {
487
+ float32x4_t v_temp = v + vec_reve(v);
488
+ return v_temp[0] + v_temp[1];
489
+ }
490
+
491
+ inline static int32_t vec_hsum_i32x4(int32x4_t v) {
492
+ int32x4_t v_temp = v + vec_reve(v);
493
+ return v_temp[0] + v_temp[1];
494
+ }
495
+
489
496
  inline static int32x4_t wsp_ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
490
497
  const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
491
498
  return acc + (vec_unpackh(p) + vec_unpackl(p));
@@ -253,6 +253,12 @@ static const struct wsp_ggml_type_traits_cpu type_traits_cpu[WSP_GGML_TYPE_COUNT
253
253
  .vec_dot_type = WSP_GGML_TYPE_Q8_1,
254
254
  .nrows = 1,
255
255
  },
256
+ [WSP_GGML_TYPE_MXFP4] = {
257
+ .from_float = wsp_quantize_row_mxfp4,
258
+ .vec_dot = wsp_ggml_vec_dot_mxfp4_q8_0,
259
+ .vec_dot_type = WSP_GGML_TYPE_Q8_0,
260
+ .nrows = 1,
261
+ },
256
262
  [WSP_GGML_TYPE_Q2_K] = {
257
263
  .from_float = wsp_quantize_row_q2_K,
258
264
  .vec_dot = wsp_ggml_vec_dot_q2_K_q8_K,
@@ -367,6 +373,9 @@ static const struct wsp_ggml_type_traits_cpu type_traits_cpu[WSP_GGML_TYPE_COUNT
367
373
  .vec_dot_type = WSP_GGML_TYPE_Q8_K,
368
374
  .nrows = 1,
369
375
  },
376
+ [WSP_GGML_TYPE_I32] = {
377
+ .from_float = (wsp_ggml_from_float_t) wsp_ggml_cpu_fp32_to_i32,
378
+ },
370
379
  };
371
380
 
372
381
  const struct wsp_ggml_type_traits_cpu * wsp_ggml_get_type_traits_cpu(enum wsp_ggml_type type) {
@@ -464,10 +473,10 @@ struct wsp_ggml_threadpool {
464
473
  struct wsp_ggml_compute_state {
465
474
  #ifndef WSP_GGML_USE_OPENMP
466
475
  wsp_ggml_thread_t thrd;
467
- bool cpumask[WSP_GGML_MAX_N_THREADS];
468
476
  int last_graph;
469
477
  bool pending;
470
478
  #endif
479
+ bool cpumask[WSP_GGML_MAX_N_THREADS];
471
480
  struct wsp_ggml_threadpool * threadpool;
472
481
  int ith;
473
482
  };
@@ -1670,6 +1679,10 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
1670
1679
  {
1671
1680
  wsp_ggml_compute_forward_add(params, tensor);
1672
1681
  } break;
1682
+ case WSP_GGML_OP_ADD_ID:
1683
+ {
1684
+ wsp_ggml_compute_forward_add_id(params, tensor);
1685
+ } break;
1673
1686
  case WSP_GGML_OP_ADD1:
1674
1687
  {
1675
1688
  wsp_ggml_compute_forward_add1(params, tensor);
@@ -1866,10 +1879,18 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
1866
1879
  {
1867
1880
  wsp_ggml_compute_forward_im2col_back_f32(params, tensor);
1868
1881
  } break;
1882
+ case WSP_GGML_OP_IM2COL_3D:
1883
+ {
1884
+ wsp_ggml_compute_forward_im2col_3d(params, tensor);
1885
+ } break;
1869
1886
  case WSP_GGML_OP_CONV_2D:
1870
1887
  {
1871
1888
  wsp_ggml_compute_forward_conv_2d(params, tensor);
1872
1889
  } break;
1890
+ case WSP_GGML_OP_CONV_3D:
1891
+ {
1892
+ wsp_ggml_compute_forward_conv_3d(params, tensor);
1893
+ } break;
1873
1894
  case WSP_GGML_OP_CONV_2D_DW:
1874
1895
  {
1875
1896
  wsp_ggml_compute_forward_conv_2d_dw(params, tensor);
@@ -1924,7 +1945,7 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
1924
1945
  } break;
1925
1946
  case WSP_GGML_OP_FLASH_ATTN_EXT:
1926
1947
  {
1927
- wsp_ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor);
1948
+ wsp_ggml_compute_forward_flash_attn_ext(params, tensor);
1928
1949
  } break;
1929
1950
  case WSP_GGML_OP_FLASH_ATTN_BACK:
1930
1951
  {
@@ -2012,6 +2033,11 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
2012
2033
  wsp_ggml_compute_forward_opt_step_adamw(params, tensor);
2013
2034
  }
2014
2035
  break;
2036
+ case WSP_GGML_OP_OPT_STEP_SGD:
2037
+ {
2038
+ wsp_ggml_compute_forward_opt_step_sgd(params, tensor);
2039
+ }
2040
+ break;
2015
2041
  case WSP_GGML_OP_NONE:
2016
2042
  {
2017
2043
  // nop
@@ -2111,6 +2137,7 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
2111
2137
  case WSP_GGML_OP_DUP:
2112
2138
  case WSP_GGML_OP_CONT:
2113
2139
  case WSP_GGML_OP_ADD:
2140
+ case WSP_GGML_OP_ADD_ID:
2114
2141
  case WSP_GGML_OP_ADD1:
2115
2142
  case WSP_GGML_OP_ACC:
2116
2143
  {
@@ -2172,6 +2199,9 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
2172
2199
  case WSP_GGML_GLU_OP_REGLU:
2173
2200
  case WSP_GGML_GLU_OP_GEGLU:
2174
2201
  case WSP_GGML_GLU_OP_SWIGLU:
2202
+ case WSP_GGML_GLU_OP_SWIGLU_OAI:
2203
+ case WSP_GGML_GLU_OP_GEGLU_ERF:
2204
+ case WSP_GGML_GLU_OP_GEGLU_QUICK:
2175
2205
  {
2176
2206
  n_tasks = n_threads;
2177
2207
  } break;
@@ -2232,7 +2262,9 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
2232
2262
  } break;
2233
2263
  case WSP_GGML_OP_IM2COL:
2234
2264
  case WSP_GGML_OP_IM2COL_BACK:
2265
+ case WSP_GGML_OP_IM2COL_3D:
2235
2266
  case WSP_GGML_OP_CONV_2D:
2267
+ case WSP_GGML_OP_CONV_3D:
2236
2268
  case WSP_GGML_OP_CONV_2D_DW:
2237
2269
  case WSP_GGML_OP_CONV_TRANSPOSE_1D:
2238
2270
  case WSP_GGML_OP_CONV_TRANSPOSE_2D:
@@ -2311,6 +2343,7 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
2311
2343
  case WSP_GGML_OP_CROSS_ENTROPY_LOSS:
2312
2344
  case WSP_GGML_OP_CROSS_ENTROPY_LOSS_BACK:
2313
2345
  case WSP_GGML_OP_OPT_STEP_ADAMW:
2346
+ case WSP_GGML_OP_OPT_STEP_SGD:
2314
2347
  {
2315
2348
  n_tasks = n_threads;
2316
2349
  } break;
@@ -2666,11 +2699,15 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(
2666
2699
  if (wsp_ggml_is_quantized(node->type) ||
2667
2700
  // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
2668
2701
  (node->src[0]->type == WSP_GGML_TYPE_F16 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_BF16) ||
2669
- (node->src[0]->type == WSP_GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_F16)) {
2702
+ (node->src[0]->type == WSP_GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_F16) ||
2703
+ // conversion between F32 and I32
2704
+ (node->src[0]->type == WSP_GGML_TYPE_F32 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_I32) ||
2705
+ (node->src[0]->type == WSP_GGML_TYPE_I32 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_F32)) {
2670
2706
  cur = wsp_ggml_type_size(WSP_GGML_TYPE_F32) * node->ne[0] * n_tasks;
2671
2707
  }
2672
2708
  } break;
2673
2709
  case WSP_GGML_OP_ADD:
2710
+ case WSP_GGML_OP_ADD_ID:
2674
2711
  case WSP_GGML_OP_ADD1:
2675
2712
  {
2676
2713
  if (wsp_ggml_is_quantized(node->src[0]->type)) {
@@ -2752,6 +2789,7 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(
2752
2789
  }
2753
2790
  } break;
2754
2791
  case WSP_GGML_OP_CONV_2D:
2792
+ case WSP_GGML_OP_CONV_3D:
2755
2793
  {
2756
2794
  cur = WSP_GGML_IM2COL_WORK_SIZE;
2757
2795
  } break;
@@ -3043,7 +3081,14 @@ static struct wsp_ggml_threadpool * wsp_ggml_threadpool_new_impl(
3043
3081
 
3044
3082
  threadpool->workers = workers;
3045
3083
 
3046
- #ifndef WSP_GGML_USE_OPENMP
3084
+ #ifdef WSP_GGML_USE_OPENMP
3085
+ int32_t cpumask_iter = 0;
3086
+
3087
+ // Compute CPU masks for each thread
3088
+ for (int j = 0; j < tpp->n_threads; j++) {
3089
+ wsp_ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
3090
+ }
3091
+ #else // WSP_GGML_USE_OPENMP
3047
3092
  wsp_ggml_mutex_init(&threadpool->mutex);
3048
3093
  wsp_ggml_cond_init(&threadpool->cond);
3049
3094
 
@@ -3116,7 +3161,14 @@ enum wsp_ggml_status wsp_ggml_graph_compute(struct wsp_ggml_cgraph * cgraph, str
3116
3161
  atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
3117
3162
  }
3118
3163
 
3119
- wsp_ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
3164
+ // Apply thread CPU mask and priority
3165
+ int ith = omp_get_thread_num();
3166
+
3167
+ wsp_ggml_thread_apply_priority(threadpool->prio);
3168
+ if (wsp_ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
3169
+ wsp_ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
3170
+ }
3171
+ wsp_ggml_graph_compute_thread(&threadpool->workers[ith]);
3120
3172
  }
3121
3173
  } else {
3122
3174
  atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
@@ -3179,20 +3231,12 @@ void wsp_ggml_cpu_fp32_to_fp16(const float * x, wsp_ggml_fp16_t * y, int64_t n)
3179
3231
  __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
3180
3232
  _mm_storel_epi64((__m128i *)(y + i), y_vec);
3181
3233
  }
3182
- #elif defined(__NNPA__)
3183
- for (; i + 7 < n; i += 8) {
3184
- float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
3185
- float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
3186
- uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
3187
- uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
3188
- vec_xst(v_y, 0, (wsp_ggml_fp16_t *)(y + i));
3189
- }
3190
- for (; i + 3 < n; i += 4) {
3191
- float32x4_t v_x = vec_xl(0, (const float *)(x + i));
3192
- float32x4_t v_zero = vec_splats(0.0f);
3193
- uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
3194
- uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
3195
- vec_xst(v_y, 0, (wsp_ggml_fp16_t *)(y + i));
3234
+ #elif defined(__riscv_zvfh)
3235
+ for (int vl; i < n; i += vl) {
3236
+ vl = __riscv_vsetvl_e32m2(n - i);
3237
+ vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
3238
+ vfloat16m1_t vy = __riscv_vfncvt_f_f_w_f16m1(vx, vl);
3239
+ __riscv_vse16_v_f16m1((_Float16 *)&y[i], vy, vl);
3196
3240
  }
3197
3241
  #endif
3198
3242
  for (; i < n; ++i) {
@@ -3220,21 +3264,6 @@ void wsp_ggml_cpu_fp16_to_fp32(const wsp_ggml_fp16_t * x, float * y, int64_t n)
3220
3264
  __m128 y_vec = _mm_cvtph_ps(x_vec);
3221
3265
  _mm_storeu_ps(y + i, y_vec);
3222
3266
  }
3223
- #elif defined(__NNPA__)
3224
- for (; i + 7 < n; i += 8) {
3225
- uint16x8_t v_x = vec_xl(0, (const wsp_ggml_fp16_t *)(x + i));
3226
- uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
3227
- float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
3228
- float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
3229
- vec_xst(v_yh, 0, (float *)(y + i + 0));
3230
- vec_xst(v_yl, 0, (float *)(y + i + 4));
3231
- }
3232
- for (; i + 3 < n; i += 4) {
3233
- uint16x8_t v_x = vec_xl(0, (const wsp_ggml_fp16_t *)(x + i));
3234
- uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
3235
- float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
3236
- vec_xst(v_yh, 0, (float *)(y + i));
3237
- }
3238
3267
  #endif
3239
3268
 
3240
3269
  for (; i < n; ++i) {
@@ -3249,6 +3278,13 @@ void wsp_ggml_cpu_fp32_to_bf16(const float * x, wsp_ggml_bf16_t * y, int64_t n)
3249
3278
  }
3250
3279
  }
3251
3280
 
3281
+ void wsp_ggml_cpu_fp32_to_i32(const float * x, int32_t * y, int64_t n) {
3282
+ int64_t i = 0;
3283
+ for (; i < n; ++i) {
3284
+ y[i] = x[i];
3285
+ }
3286
+ }
3287
+
3252
3288
  void wsp_ggml_cpu_bf16_to_fp32(const wsp_ggml_bf16_t * x, float * y, int64_t n) {
3253
3289
  int64_t i = 0;
3254
3290
  #if defined(__AVX2__)
@@ -3438,14 +3474,6 @@ int wsp_ggml_cpu_has_vxe(void) {
3438
3474
  #endif
3439
3475
  }
3440
3476
 
3441
- int wsp_ggml_cpu_has_nnpa(void) {
3442
- #if defined(WSP_GGML_NNPA)
3443
- return 1;
3444
- #else
3445
- return 0;
3446
- #endif
3447
- }
3448
-
3449
3477
  int wsp_ggml_cpu_has_neon(void) {
3450
3478
  #if defined(__ARM_ARCH) && defined(__ARM_NEON)
3451
3479
  return 1;
@@ -18,6 +18,10 @@
18
18
  # include "kleidiai/kleidiai.h"
19
19
  #endif
20
20
 
21
+ #ifdef WSP_GGML_USE_CPU_RISCV64_SPACEMIT
22
+ # include "spacemit/ime.h"
23
+ #endif
24
+
21
25
  #if defined(_WIN32)
22
26
  # define WIN32_LEAN_AND_MEAN
23
27
  # ifndef NOMINMAX
@@ -35,7 +39,7 @@
35
39
 
36
40
  // ggml-backend interface
37
41
 
38
- std::vector<wsp_ggml_backend_buffer_type_t>& wsp_ggml_backend_cpu_get_extra_buffers_type() {
42
+ std::vector<wsp_ggml_backend_buffer_type_t> & wsp_ggml_backend_cpu_get_extra_buffer_types() {
39
43
  static std::vector<wsp_ggml_backend_buffer_type_t> bufts = []() {
40
44
  std::vector<wsp_ggml_backend_buffer_type_t> bufts;
41
45
 
@@ -45,6 +49,12 @@ std::vector<wsp_ggml_backend_buffer_type_t>& wsp_ggml_backend_cpu_get_extra_buff
45
49
  }
46
50
  #endif
47
51
 
52
+ #ifdef WSP_GGML_USE_CPU_RISCV64_SPACEMIT
53
+ if (wsp_ggml_backend_cpu_riscv64_spacemit_buffer_type()) {
54
+ bufts.push_back(wsp_ggml_backend_cpu_riscv64_spacemit_buffer_type());
55
+ }
56
+ #endif
57
+
48
58
  #ifdef WSP_GGML_USE_CPU_KLEIDIAI
49
59
  if (wsp_ggml_backend_cpu_kleidiai_buffer_type()) {
50
60
  bufts.push_back(wsp_ggml_backend_cpu_kleidiai_buffer_type());
@@ -57,8 +67,6 @@ std::vector<wsp_ggml_backend_buffer_type_t>& wsp_ggml_backend_cpu_get_extra_buff
57
67
  }
58
68
  #endif
59
69
 
60
- bufts.push_back(NULL);
61
-
62
70
  return bufts;
63
71
  }();
64
72
 
@@ -66,14 +74,20 @@ std::vector<wsp_ggml_backend_buffer_type_t>& wsp_ggml_backend_cpu_get_extra_buff
66
74
  }
67
75
 
68
76
  static wsp_ggml_backend_buffer_type_t * wsp_ggml_backend_cpu_device_get_extra_buffers_type(wsp_ggml_backend_dev_t device) {
69
- return wsp_ggml_backend_cpu_get_extra_buffers_type().data();
77
+ static std::vector<wsp_ggml_backend_buffer_type_t> extra_bufts = [] {
78
+ std::vector<wsp_ggml_backend_buffer_type_t> bufts = wsp_ggml_backend_cpu_get_extra_buffer_types();
79
+ bufts.push_back(nullptr);
80
+ return bufts;
81
+ }();
82
+
83
+ return extra_bufts.data();
70
84
 
71
85
  WSP_GGML_UNUSED(device);
72
86
  }
73
87
 
74
88
  static bool wsp_ggml_backend_cpu_is_extra_buffer_type(wsp_ggml_backend_buffer_type_t buft) {
75
- for (auto * extra : wsp_ggml_backend_cpu_get_extra_buffers_type()) {
76
- if (extra && extra == buft) {
89
+ for (auto * extra : wsp_ggml_backend_cpu_get_extra_buffer_types()) {
90
+ if (extra == buft) {
77
91
  return true;
78
92
  }
79
93
  }
@@ -186,6 +200,7 @@ static const struct wsp_ggml_backend_i wsp_ggml_backend_cpu_i = {
186
200
  /* .graph_compute = */ wsp_ggml_backend_cpu_graph_compute,
187
201
  /* .event_record = */ NULL,
188
202
  /* .event_wait = */ NULL,
203
+ /* .graph_optimize = */ NULL,
189
204
  };
190
205
 
191
206
  static wsp_ggml_guid_t wsp_ggml_backend_cpu_guid(void) {
@@ -210,10 +225,10 @@ wsp_ggml_backend_t wsp_ggml_backend_cpu_init(void) {
210
225
  ctx->abort_callback_data = NULL;
211
226
 
212
227
  wsp_ggml_backend_t cpu_backend = new wsp_ggml_backend {
213
- /* .guid = */ wsp_ggml_backend_cpu_guid(),
214
- /* .interface = */ wsp_ggml_backend_cpu_i,
215
- /* .device = */ wsp_ggml_backend_reg_dev_get(wsp_ggml_backend_cpu_reg(), 0),
216
- /* .context = */ ctx,
228
+ /* .guid = */ wsp_ggml_backend_cpu_guid(),
229
+ /* .iface = */ wsp_ggml_backend_cpu_i,
230
+ /* .device = */ wsp_ggml_backend_reg_dev_get(wsp_ggml_backend_cpu_reg(), 0),
231
+ /* .context = */ ctx,
217
232
  };
218
233
 
219
234
  if (cpu_backend == NULL) {
@@ -344,8 +359,10 @@ static void wsp_ggml_backend_cpu_device_get_memory(wsp_ggml_backend_dev_t dev, s
344
359
  long pages = sysconf(_SC_PHYS_PAGES);
345
360
  long page_size = sysconf(_SC_PAGE_SIZE);
346
361
  *total = pages * page_size;
362
+
363
+ // "free" system memory is ill-defined, for practical purposes assume that all of it is free:
347
364
  *free = *total;
348
- #endif
365
+ #endif // _WIN32
349
366
 
350
367
  WSP_GGML_UNUSED(dev);
351
368
  }
@@ -397,20 +414,13 @@ static bool wsp_ggml_backend_cpu_device_supports_op(wsp_ggml_backend_dev_t dev,
397
414
  return true;
398
415
  }
399
416
 
400
- // extra_buffer_op?
401
- for (auto extra : wsp_ggml_backend_cpu_get_extra_buffers_type()) {
402
- if (extra) {
403
- auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
404
- if (buf_extra && buf_extra->supports_op(dev, op)) {
405
- return true;
406
- }
407
- }
408
- }
409
-
410
- // the other case need host buffer.
411
- for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
412
- if (op->src[i] && op->src[i]->buffer && !wsp_ggml_backend_buft_is_host(op->src[i]->buffer->buft)) {
413
- return false;
417
+ // check extra buffer types
418
+ // note: only the first sources are checked for extra buffer types to reduce overhead, increase if necessary
419
+ for (int i = 0; i < 4; i++) {
420
+ if (op->src[i] && op->src[i]->buffer &&
421
+ wsp_ggml_backend_cpu_is_extra_buffer_type(op->src[i]->buffer->buft)) {
422
+ auto * buf_extra = (ggml::cpu::extra_buffer_type *) op->src[i]->buffer->buft->context;
423
+ return buf_extra->supports_op(dev, op);
414
424
  }
415
425
  }
416
426
 
@@ -579,9 +589,6 @@ static wsp_ggml_backend_feature * wsp_ggml_backend_cpu_get_features(wsp_ggml_bac
579
589
  if (wsp_ggml_cpu_has_vxe()) {
580
590
  features.push_back({ "VXE", "1" });
581
591
  }
582
- if (wsp_ggml_cpu_has_nnpa()) {
583
- features.push_back({ "NNPA", "1" });
584
- }
585
592
  if (wsp_ggml_cpu_has_wasm_simd()) {
586
593
  features.push_back({ "WASM_SIMD", "1" });
587
594
  }