whisper.rn 0.5.3 → 0.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/README.md +1 -1
  2. package/android/src/main/java/com/rnwhisper/WhisperContext.java +5 -0
  3. package/android/src/main/jni.cpp +13 -0
  4. package/cpp/ggml-alloc.c +78 -26
  5. package/cpp/ggml-alloc.h +9 -0
  6. package/cpp/ggml-backend-impl.h +1 -1
  7. package/cpp/ggml-backend-reg.cpp +19 -3
  8. package/cpp/ggml-backend.cpp +72 -20
  9. package/cpp/ggml-backend.h +2 -1
  10. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
  11. package/cpp/ggml-cpu/arch/arm/repack.cpp +1004 -0
  12. package/cpp/ggml-cpu/arch/x86/repack.cpp +6 -6
  13. package/cpp/ggml-cpu/arch-fallback.h +50 -2
  14. package/cpp/ggml-cpu/ggml-cpu-impl.h +1 -1
  15. package/cpp/ggml-cpu/ggml-cpu.c +139 -58
  16. package/cpp/ggml-cpu/ggml-cpu.cpp +4 -0
  17. package/cpp/ggml-cpu/ops.cpp +170 -18
  18. package/cpp/ggml-cpu/ops.h +1 -0
  19. package/cpp/ggml-cpu/repack.cpp +531 -5
  20. package/cpp/ggml-cpu/repack.h +14 -0
  21. package/cpp/ggml-cpu/simd-mappings.h +16 -18
  22. package/cpp/ggml-cpu/vec.cpp +41 -1
  23. package/cpp/ggml-cpu/vec.h +241 -138
  24. package/cpp/ggml-cpu.h +1 -0
  25. package/cpp/ggml-impl.h +0 -4
  26. package/cpp/ggml-metal/ggml-metal-context.m +26 -16
  27. package/cpp/ggml-metal/ggml-metal-device.cpp +452 -371
  28. package/cpp/ggml-metal/ggml-metal-device.h +87 -65
  29. package/cpp/ggml-metal/ggml-metal-device.m +263 -104
  30. package/cpp/ggml-metal/ggml-metal-impl.h +58 -4
  31. package/cpp/ggml-metal/ggml-metal-ops.cpp +415 -98
  32. package/cpp/ggml-metal/ggml-metal-ops.h +4 -0
  33. package/cpp/ggml-metal/ggml-metal.cpp +6 -5
  34. package/cpp/ggml-metal/ggml-metal.metal +404 -34
  35. package/cpp/ggml.c +110 -31
  36. package/cpp/ggml.h +51 -12
  37. package/cpp/jsi/RNWhisperJSI.cpp +1 -0
  38. package/cpp/whisper.cpp +17 -4
  39. package/ios/CMakeLists.txt +21 -1
  40. package/ios/RNWhisperContext.mm +5 -0
  41. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-alloc.h +9 -0
  42. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +1 -1
  43. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +2 -1
  44. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -0
  45. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +0 -4
  46. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +51 -12
  47. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  48. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-metal.metal +404 -34
  49. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  50. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-alloc.h +9 -0
  51. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +1 -1
  52. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +2 -1
  53. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -0
  54. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +0 -4
  55. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +51 -12
  56. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  57. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  58. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-metal.metal +404 -34
  59. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  60. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-alloc.h +9 -0
  61. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +1 -1
  62. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +2 -1
  63. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -0
  64. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +0 -4
  65. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +51 -12
  66. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  67. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-metal.metal +404 -34
  68. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  69. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-alloc.h +9 -0
  70. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +1 -1
  71. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +2 -1
  72. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -0
  73. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +0 -4
  74. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +51 -12
  75. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  76. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  77. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-metal.metal +404 -34
  78. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  79. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  80. package/lib/commonjs/jest-mock.js +2 -0
  81. package/lib/commonjs/jest-mock.js.map +1 -1
  82. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +156 -12
  83. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -1
  84. package/lib/commonjs/version.json +1 -1
  85. package/lib/module/NativeRNWhisper.js.map +1 -1
  86. package/lib/module/jest-mock.js +2 -0
  87. package/lib/module/jest-mock.js.map +1 -1
  88. package/lib/module/realtime-transcription/RealtimeTranscriber.js +155 -12
  89. package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -1
  90. package/lib/module/version.json +1 -1
  91. package/lib/typescript/NativeRNWhisper.d.ts +1 -0
  92. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  93. package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts +29 -0
  94. package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -1
  95. package/lib/typescript/realtime-transcription/types.d.ts +7 -0
  96. package/lib/typescript/realtime-transcription/types.d.ts.map +1 -1
  97. package/package.json +1 -1
  98. package/src/NativeRNWhisper.ts +1 -0
  99. package/src/jest-mock.ts +2 -0
  100. package/src/realtime-transcription/RealtimeTranscriber.ts +179 -9
  101. package/src/realtime-transcription/types.ts +9 -0
  102. package/src/version.json +1 -1
package/cpp/ggml-cpu/arch/x86/repack.cpp
@@ -646,7 +646,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * WSP_GGML_RESTRICT s, siz
  __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
  int64_t xstart = 0;
  int anr = nr - nr%16; // Used to align nr with boundary of 16
- #ifdef __AVX512F__
+ #if defined(__AVX512BW__) && defined(__AVX512DQ__)
  int anc = nc - nc%16; // Used to align nc with boundary of 16
  // Mask to mask out nibbles from packed bytes expanded to 512 bit length
  const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
@@ -1041,7 +1041,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * WSP_GGML_RESTRICT s, siz
  xstart = anc/8;
  y = 0;
  }
- #endif // __AVX512F__
+ #endif // __AVX512BW__ && __AVX512DQ__

  // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation

@@ -1989,7 +1989,7 @@ void wsp_ggml_gemm_q4_K_8x8_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
  __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
  int64_t xstart = 0;
  int anr = nr - nr % 16;; // Used to align nr with boundary of 16
- #ifdef __AVX512F__
+ #if defined(__AVX512BW__) && defined(__AVX512DQ__)
  int anc = nc - nc % 16; // Used to align nc with boundary of 16
  // Mask to mask out nibbles from packed bytes expanded to 512 bit length
  const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
@@ -2727,7 +2727,7 @@ void wsp_ggml_gemm_q4_K_8x8_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
  xstart = anc/8;
  y = 0;
  }
- #endif //AVX512F
+ #endif // __AVX512BW__ && __AVX512DQ__

  // Take group of four block_q8_Kx4 structures at each pass of the loop and perform dot product operation
  for (; y < anr / 4; y += 4) {
@@ -3467,7 +3467,7 @@ void wsp_ggml_gemm_q2_K_8x8_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
  __m256i scalesmask2 = _mm256_castsi128_si256(scalesmask2_sse);
  scalesmask2 = _mm256_permute2f128_si256(scalesmask2, scalesmask2, 0);

- #ifdef __AVX512F__
+ #if defined(__AVX512BW__) && defined(__AVX512DQ__)

  int anc = nc - nc % 16; // Used to align nc with boundary of 16

@@ -4947,7 +4947,7 @@ void wsp_ggml_gemm_q2_K_8x8_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
  y = 0;
  }

- #endif //AVX512F
+ #endif // __AVX512BW__ && __AVX512DQ__

  // Take group of four block_q8_Kx4 structures at each pass of the loop and perform dot product operation
  for (; y < anr / 4; y += 4) {
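The six hunks above tighten the guard around the 512-bit GEMM paths from a bare __AVX512F__ check to __AVX512BW__ && __AVX512DQ__, presumably because those kernels rely on instructions from the BW/DQ subsets that plain AVX-512F does not guarantee. For illustration only, a minimal standalone C sketch of the same compile-time dispatch (the function and strings are hypothetical, not whisper.rn code):

#include <stdio.h>

/* Sketch: select the wide kernel only when the AVX-512 subsets it needs exist. */
#if defined(__AVX512BW__) && defined(__AVX512DQ__)
static const char * gemm_path(void) { return "16-column AVX-512 (BW+DQ) path"; }
#else
static const char * gemm_path(void) { return "8-column AVX2 path"; }
#endif

int main(void) {
    printf("selected GEMM path: %s\n", gemm_path());
    return 0;
}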
package/cpp/ggml-cpu/arch-fallback.h
@@ -33,39 +33,52 @@
  // repack.cpp
  #define wsp_ggml_wsp_quantize_mat_q8_0_4x4_generic wsp_ggml_wsp_quantize_mat_q8_0_4x4
  #define wsp_ggml_wsp_quantize_mat_q8_0_4x8_generic wsp_ggml_wsp_quantize_mat_q8_0_4x8
+ #define wsp_ggml_wsp_quantize_mat_q8_K_4x4_generic wsp_ggml_wsp_quantize_mat_q8_K_4x4
  #define wsp_ggml_wsp_quantize_mat_q8_K_4x8_generic wsp_ggml_wsp_quantize_mat_q8_K_4x8
  #define wsp_ggml_gemv_q4_0_4x4_q8_0_generic wsp_ggml_gemv_q4_0_4x4_q8_0
  #define wsp_ggml_gemv_q4_0_4x8_q8_0_generic wsp_ggml_gemv_q4_0_4x8_q8_0
  #define wsp_ggml_gemv_q4_0_8x8_q8_0_generic wsp_ggml_gemv_q4_0_8x8_q8_0
+ #define wsp_ggml_gemv_q4_K_8x4_q8_K_generic wsp_ggml_gemv_q4_K_8x4_q8_K
  #define wsp_ggml_gemv_q4_K_8x8_q8_K_generic wsp_ggml_gemv_q4_K_8x8_q8_K
  #define wsp_ggml_gemv_q2_K_8x8_q8_K_generic wsp_ggml_gemv_q2_K_8x8_q8_K
  #define wsp_ggml_gemv_iq4_nl_4x4_q8_0_generic wsp_ggml_gemv_iq4_nl_4x4_q8_0
  #define wsp_ggml_gemv_iq4_nl_8x8_q8_0_generic wsp_ggml_gemv_iq4_nl_8x8_q8_0
+ #define wsp_ggml_gemv_q8_0_4x4_q8_0_generic wsp_ggml_gemv_q8_0_4x4_q8_0
+ #define wsp_ggml_gemv_q8_0_4x8_q8_0_generic wsp_ggml_gemv_q8_0_4x8_q8_0
  #define wsp_ggml_gemm_q4_0_4x4_q8_0_generic wsp_ggml_gemm_q4_0_4x4_q8_0
  #define wsp_ggml_gemm_q4_0_4x8_q8_0_generic wsp_ggml_gemm_q4_0_4x8_q8_0
  #define wsp_ggml_gemm_q4_0_8x8_q8_0_generic wsp_ggml_gemm_q4_0_8x8_q8_0
+ #define wsp_ggml_gemm_q4_K_8x4_q8_K_generic wsp_ggml_gemm_q4_K_8x4_q8_K
  #define wsp_ggml_gemm_q4_K_8x8_q8_K_generic wsp_ggml_gemm_q4_K_8x8_q8_K
  #define wsp_ggml_gemm_q2_K_8x8_q8_K_generic wsp_ggml_gemm_q2_K_8x8_q8_K
  #define wsp_ggml_gemm_iq4_nl_4x4_q8_0_generic wsp_ggml_gemm_iq4_nl_4x4_q8_0
  #define wsp_ggml_gemm_iq4_nl_8x8_q8_0_generic wsp_ggml_gemm_iq4_nl_8x8_q8_0
+ #define wsp_ggml_gemm_q8_0_4x4_q8_0_generic wsp_ggml_gemm_q8_0_4x4_q8_0
+ #define wsp_ggml_gemm_q8_0_4x8_q8_0_generic wsp_ggml_gemm_q8_0_4x8_q8_0
  #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
  // repack.cpp
+ #define wsp_ggml_wsp_quantize_mat_q8_K_4x4_generic wsp_ggml_wsp_quantize_mat_q8_K_4x4
  #define wsp_ggml_wsp_quantize_mat_q8_K_4x8_generic wsp_ggml_wsp_quantize_mat_q8_K_4x8
- #define wsp_ggml_gemv_q4_K_8x8_q8_K_generic wsp_ggml_gemv_q4_K_8x8_q8_K
  #define wsp_ggml_gemv_iq4_nl_8x8_q8_0_generic wsp_ggml_gemv_iq4_nl_8x8_q8_0
  #define wsp_ggml_gemv_q2_K_8x8_q8_K_generic wsp_ggml_gemv_q2_K_8x8_q8_K
- #define wsp_ggml_gemm_q4_K_8x8_q8_K_generic wsp_ggml_gemm_q4_K_8x8_q8_K
  #define wsp_ggml_gemm_iq4_nl_8x8_q8_0_generic wsp_ggml_gemm_iq4_nl_8x8_q8_0
  #define wsp_ggml_gemm_q2_K_8x8_q8_K_generic wsp_ggml_gemm_q2_K_8x8_q8_K
  #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
  // repack.cpp
  #define wsp_ggml_wsp_quantize_mat_q8_0_4x4_generic wsp_ggml_wsp_quantize_mat_q8_0_4x4
+ #define wsp_ggml_wsp_quantize_mat_q8_K_4x4_generic wsp_ggml_wsp_quantize_mat_q8_K_4x4
  #define wsp_ggml_gemv_q4_0_4x4_q8_0_generic wsp_ggml_gemv_q4_0_4x4_q8_0
  #define wsp_ggml_gemv_q4_0_4x8_q8_0_generic wsp_ggml_gemv_q4_0_4x8_q8_0
+ #define wsp_ggml_gemv_q4_K_8x4_q8_K_generic wsp_ggml_gemv_q4_K_8x4_q8_K
  #define wsp_ggml_gemv_iq4_nl_4x4_q8_0_generic wsp_ggml_gemv_iq4_nl_4x4_q8_0
+ #define wsp_ggml_gemv_q8_0_4x4_q8_0_generic wsp_ggml_gemv_q8_0_4x4_q8_0
+ #define wsp_ggml_gemv_q8_0_4x8_q8_0_generic wsp_ggml_gemv_q8_0_4x8_q8_0
  #define wsp_ggml_gemm_q4_0_4x4_q8_0_generic wsp_ggml_gemm_q4_0_4x4_q8_0
  #define wsp_ggml_gemm_q4_0_4x8_q8_0_generic wsp_ggml_gemm_q4_0_4x8_q8_0
+ #define wsp_ggml_gemm_q4_K_8x4_q8_K_generic wsp_ggml_gemm_q4_K_8x4_q8_K
  #define wsp_ggml_gemm_iq4_nl_4x4_q8_0_generic wsp_ggml_gemm_iq4_nl_4x4_q8_0
+ #define wsp_ggml_gemm_q8_0_4x4_q8_0_generic wsp_ggml_gemm_q8_0_4x4_q8_0
+ #define wsp_ggml_gemm_q8_0_4x8_q8_0_generic wsp_ggml_gemm_q8_0_4x8_q8_0
  #elif defined(__POWERPC__) || defined(__powerpc__)
  // ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
  // quants.c
@@ -76,21 +89,28 @@
  // repack.cpp
  #define wsp_ggml_wsp_quantize_mat_q8_0_4x4_generic wsp_ggml_wsp_quantize_mat_q8_0_4x4
  #define wsp_ggml_wsp_quantize_mat_q8_0_4x8_generic wsp_ggml_wsp_quantize_mat_q8_0_4x8
+ #define wsp_ggml_wsp_quantize_mat_q8_K_4x4_generic wsp_ggml_wsp_quantize_mat_q8_K_4x4
  #define wsp_ggml_wsp_quantize_mat_q8_K_4x8_generic wsp_ggml_wsp_quantize_mat_q8_K_4x8
  #define wsp_ggml_gemv_q4_0_4x4_q8_0_generic wsp_ggml_gemv_q4_0_4x4_q8_0
  #define wsp_ggml_gemv_q4_0_4x8_q8_0_generic wsp_ggml_gemv_q4_0_4x8_q8_0
  #define wsp_ggml_gemv_q4_0_8x8_q8_0_generic wsp_ggml_gemv_q4_0_8x8_q8_0
+ #define wsp_ggml_gemv_q4_K_8x4_q8_K_generic wsp_ggml_gemv_q4_K_8x4_q8_K
  #define wsp_ggml_gemv_q4_K_8x8_q8_K_generic wsp_ggml_gemv_q4_K_8x8_q8_K
  #define wsp_ggml_gemv_q2_K_8x8_q8_K_generic wsp_ggml_gemv_q2_K_8x8_q8_K
  #define wsp_ggml_gemv_iq4_nl_4x4_q8_0_generic wsp_ggml_gemv_iq4_nl_4x4_q8_0
  #define wsp_ggml_gemv_iq4_nl_8x8_q8_0_generic wsp_ggml_gemv_iq4_nl_8x8_q8_0
+ #define wsp_ggml_gemv_q8_0_4x4_q8_0_generic wsp_ggml_gemv_q8_0_4x4_q8_0
+ #define wsp_ggml_gemv_q8_0_4x8_q8_0_generic wsp_ggml_gemv_q8_0_4x8_q8_0
  #define wsp_ggml_gemm_q4_0_4x4_q8_0_generic wsp_ggml_gemm_q4_0_4x4_q8_0
  #define wsp_ggml_gemm_q4_0_4x8_q8_0_generic wsp_ggml_gemm_q4_0_4x8_q8_0
  #define wsp_ggml_gemm_q4_0_8x8_q8_0_generic wsp_ggml_gemm_q4_0_8x8_q8_0
+ #define wsp_ggml_gemm_q4_K_8x4_q8_K_generic wsp_ggml_gemm_q4_K_8x4_q8_K
  #define wsp_ggml_gemm_q4_K_8x8_q8_K_generic wsp_ggml_gemm_q4_K_8x8_q8_K
  #define wsp_ggml_gemm_q2_K_8x8_q8_K_generic wsp_ggml_gemm_q2_K_8x8_q8_K
  #define wsp_ggml_gemm_iq4_nl_4x4_q8_0_generic wsp_ggml_gemm_iq4_nl_4x4_q8_0
  #define wsp_ggml_gemm_iq4_nl_8x8_q8_0_generic wsp_ggml_gemm_iq4_nl_8x8_q8_0
+ #define wsp_ggml_gemm_q8_0_4x4_q8_0_generic wsp_ggml_gemm_q8_0_4x4_q8_0
+ #define wsp_ggml_gemm_q8_0_4x8_q8_0_generic wsp_ggml_gemm_q8_0_4x8_q8_0
  #elif defined(__loongarch64)
  // quants.c
  #define wsp_quantize_row_q8_K_generic wsp_quantize_row_q8_K
@@ -101,21 +121,28 @@
  // repack.cpp
  #define wsp_ggml_wsp_quantize_mat_q8_0_4x4_generic wsp_ggml_wsp_quantize_mat_q8_0_4x4
  #define wsp_ggml_wsp_quantize_mat_q8_0_4x8_generic wsp_ggml_wsp_quantize_mat_q8_0_4x8
+ #define wsp_ggml_wsp_quantize_mat_q8_K_4x4_generic wsp_ggml_wsp_quantize_mat_q8_K_4x4
  #define wsp_ggml_wsp_quantize_mat_q8_K_4x8_generic wsp_ggml_wsp_quantize_mat_q8_K_4x8
  #define wsp_ggml_gemv_q4_0_4x4_q8_0_generic wsp_ggml_gemv_q4_0_4x4_q8_0
  #define wsp_ggml_gemv_q4_0_4x8_q8_0_generic wsp_ggml_gemv_q4_0_4x8_q8_0
  #define wsp_ggml_gemv_q4_0_8x8_q8_0_generic wsp_ggml_gemv_q4_0_8x8_q8_0
+ #define wsp_ggml_gemv_q4_K_8x4_q8_K_generic wsp_ggml_gemv_q4_K_8x4_q8_K
  #define wsp_ggml_gemv_q4_K_8x8_q8_K_generic wsp_ggml_gemv_q4_K_8x8_q8_K
  #define wsp_ggml_gemv_q2_K_8x8_q8_K_generic wsp_ggml_gemv_q2_K_8x8_q8_K
  #define wsp_ggml_gemv_iq4_nl_4x4_q8_0_generic wsp_ggml_gemv_iq4_nl_4x4_q8_0
  #define wsp_ggml_gemv_iq4_nl_8x8_q8_0_generic wsp_ggml_gemv_iq4_nl_8x8_q8_0
+ #define wsp_ggml_gemv_q8_0_4x4_q8_0_generic wsp_ggml_gemv_q8_0_4x4_q8_0
+ #define wsp_ggml_gemv_q8_0_4x8_q8_0_generic wsp_ggml_gemv_q8_0_4x8_q8_0
  #define wsp_ggml_gemm_q4_0_4x4_q8_0_generic wsp_ggml_gemm_q4_0_4x4_q8_0
  #define wsp_ggml_gemm_q4_0_4x8_q8_0_generic wsp_ggml_gemm_q4_0_4x8_q8_0
  #define wsp_ggml_gemm_q4_0_8x8_q8_0_generic wsp_ggml_gemm_q4_0_8x8_q8_0
+ #define wsp_ggml_gemm_q4_K_8x4_q8_K_generic wsp_ggml_gemm_q4_K_8x4_q8_K
  #define wsp_ggml_gemm_q4_K_8x8_q8_K_generic wsp_ggml_gemm_q4_K_8x8_q8_K
  #define wsp_ggml_gemm_q2_K_8x8_q8_K_generic wsp_ggml_gemm_q2_K_8x8_q8_K
  #define wsp_ggml_gemm_iq4_nl_4x4_q8_0_generic wsp_ggml_gemm_iq4_nl_4x4_q8_0
  #define wsp_ggml_gemm_iq4_nl_8x8_q8_0_generic wsp_ggml_gemm_iq4_nl_8x8_q8_0
+ #define wsp_ggml_gemm_q8_0_4x4_q8_0_generic wsp_ggml_gemm_q8_0_4x4_q8_0
+ #define wsp_ggml_gemm_q8_0_4x8_q8_0_generic wsp_ggml_gemm_q8_0_4x8_q8_0
  #elif defined(__riscv)
  // quants.c
  #define wsp_quantize_row_q8_K_generic wsp_quantize_row_q8_K
@@ -134,19 +161,26 @@
  // repack.cpp
  #define wsp_ggml_wsp_quantize_mat_q8_0_4x4_generic wsp_ggml_wsp_quantize_mat_q8_0_4x4
  #define wsp_ggml_wsp_quantize_mat_q8_0_4x8_generic wsp_ggml_wsp_quantize_mat_q8_0_4x8
+ #define wsp_ggml_wsp_quantize_mat_q8_K_4x4_generic wsp_ggml_wsp_quantize_mat_q8_K_4x4
  #define wsp_ggml_wsp_quantize_mat_q8_K_4x8_generic wsp_ggml_wsp_quantize_mat_q8_K_4x8
  #define wsp_ggml_gemv_q4_0_4x4_q8_0_generic wsp_ggml_gemv_q4_0_4x4_q8_0
  #define wsp_ggml_gemv_q4_0_4x8_q8_0_generic wsp_ggml_gemv_q4_0_4x8_q8_0
+ #define wsp_ggml_gemv_q4_K_8x4_q8_K_generic wsp_ggml_gemv_q4_K_8x4_q8_K
  #define wsp_ggml_gemv_q4_K_8x8_q8_K_generic wsp_ggml_gemv_q4_K_8x8_q8_K
  #define wsp_ggml_gemv_q2_K_8x8_q8_K_generic wsp_ggml_gemv_q2_K_8x8_q8_K
  #define wsp_ggml_gemv_iq4_nl_4x4_q8_0_generic wsp_ggml_gemv_iq4_nl_4x4_q8_0
  #define wsp_ggml_gemv_iq4_nl_8x8_q8_0_generic wsp_ggml_gemv_iq4_nl_8x8_q8_0
+ #define wsp_ggml_gemv_q8_0_4x4_q8_0_generic wsp_ggml_gemv_q8_0_4x4_q8_0
+ #define wsp_ggml_gemv_q8_0_4x8_q8_0_generic wsp_ggml_gemv_q8_0_4x8_q8_0
  #define wsp_ggml_gemm_q4_0_4x4_q8_0_generic wsp_ggml_gemm_q4_0_4x4_q8_0
  #define wsp_ggml_gemm_q4_0_4x8_q8_0_generic wsp_ggml_gemm_q4_0_4x8_q8_0
+ #define wsp_ggml_gemm_q4_K_8x4_q8_K_generic wsp_ggml_gemm_q4_K_8x4_q8_K
  #define wsp_ggml_gemm_q4_K_8x8_q8_K_generic wsp_ggml_gemm_q4_K_8x8_q8_K
  #define wsp_ggml_gemm_q2_K_8x8_q8_K_generic wsp_ggml_gemm_q2_K_8x8_q8_K
  #define wsp_ggml_gemm_iq4_nl_4x4_q8_0_generic wsp_ggml_gemm_iq4_nl_4x4_q8_0
  #define wsp_ggml_gemm_iq4_nl_8x8_q8_0_generic wsp_ggml_gemm_iq4_nl_8x8_q8_0
+ #define wsp_ggml_gemm_q8_0_4x4_q8_0_generic wsp_ggml_gemm_q8_0_4x4_q8_0
+ #define wsp_ggml_gemm_q8_0_4x8_q8_0_generic wsp_ggml_gemm_q8_0_4x8_q8_0
  #elif defined(__s390x__)
  // quants.c
  #define wsp_quantize_row_q8_K_generic wsp_quantize_row_q8_K
@@ -163,21 +197,28 @@
  // repack.cpp
  #define wsp_ggml_wsp_quantize_mat_q8_0_4x4_generic wsp_ggml_wsp_quantize_mat_q8_0_4x4
  #define wsp_ggml_wsp_quantize_mat_q8_0_4x8_generic wsp_ggml_wsp_quantize_mat_q8_0_4x8
+ #define wsp_ggml_wsp_quantize_mat_q8_K_4x4_generic wsp_ggml_wsp_quantize_mat_q8_K_4x4
  #define wsp_ggml_wsp_quantize_mat_q8_K_4x8_generic wsp_ggml_wsp_quantize_mat_q8_K_4x8
  #define wsp_ggml_gemv_q4_0_4x4_q8_0_generic wsp_ggml_gemv_q4_0_4x4_q8_0
  #define wsp_ggml_gemv_q4_0_4x8_q8_0_generic wsp_ggml_gemv_q4_0_4x8_q8_0
  #define wsp_ggml_gemv_q4_0_8x8_q8_0_generic wsp_ggml_gemv_q4_0_8x8_q8_0
+ #define wsp_ggml_gemv_q4_K_8x4_q8_K_generic wsp_ggml_gemv_q4_K_8x4_q8_K
  #define wsp_ggml_gemv_q4_K_8x8_q8_K_generic wsp_ggml_gemv_q4_K_8x8_q8_K
  #define wsp_ggml_gemv_q2_K_8x8_q8_K_generic wsp_ggml_gemv_q2_K_8x8_q8_K
  #define wsp_ggml_gemv_iq4_nl_4x4_q8_0_generic wsp_ggml_gemv_iq4_nl_4x4_q8_0
  #define wsp_ggml_gemv_iq4_nl_8x8_q8_0_generic wsp_ggml_gemv_iq4_nl_8x8_q8_0
+ #define wsp_ggml_gemv_q8_0_4x4_q8_0_generic wsp_ggml_gemv_q8_0_4x4_q8_0
+ #define wsp_ggml_gemv_q8_0_4x8_q8_0_generic wsp_ggml_gemv_q8_0_4x8_q8_0
  #define wsp_ggml_gemm_q4_0_4x4_q8_0_generic wsp_ggml_gemm_q4_0_4x4_q8_0
  #define wsp_ggml_gemm_q4_0_4x8_q8_0_generic wsp_ggml_gemm_q4_0_4x8_q8_0
  #define wsp_ggml_gemm_q4_0_8x8_q8_0_generic wsp_ggml_gemm_q4_0_8x8_q8_0
+ #define wsp_ggml_gemm_q4_K_8x4_q8_K_generic wsp_ggml_gemm_q4_K_8x4_q8_K
  #define wsp_ggml_gemm_q4_K_8x8_q8_K_generic wsp_ggml_gemm_q4_K_8x8_q8_K
  #define wsp_ggml_gemm_q2_K_8x8_q8_K_generic wsp_ggml_gemm_q2_K_8x8_q8_K
  #define wsp_ggml_gemm_iq4_nl_4x4_q8_0_generic wsp_ggml_gemm_iq4_nl_4x4_q8_0
  #define wsp_ggml_gemm_iq4_nl_8x8_q8_0_generic wsp_ggml_gemm_iq4_nl_8x8_q8_0
+ #define wsp_ggml_gemm_q8_0_4x4_q8_0_generic wsp_ggml_gemm_q8_0_4x4_q8_0
+ #define wsp_ggml_gemm_q8_0_4x8_q8_0_generic wsp_ggml_gemm_q8_0_4x8_q8_0
  #elif defined(__wasm__)
  // quants.c
  #define wsp_ggml_vec_dot_q4_1_q8_1_generic wsp_ggml_vec_dot_q4_1_q8_1
@@ -196,19 +237,26 @@
  // repack.cpp
  #define wsp_ggml_wsp_quantize_mat_q8_0_4x4_generic wsp_ggml_wsp_quantize_mat_q8_0_4x4
  #define wsp_ggml_wsp_quantize_mat_q8_0_4x8_generic wsp_ggml_wsp_quantize_mat_q8_0_4x8
+ #define wsp_ggml_wsp_quantize_mat_q8_K_4x4_generic wsp_ggml_wsp_quantize_mat_q8_K_4x4
  #define wsp_ggml_wsp_quantize_mat_q8_K_4x8_generic wsp_ggml_wsp_quantize_mat_q8_K_4x8
  #define wsp_ggml_gemv_q4_0_4x4_q8_0_generic wsp_ggml_gemv_q4_0_4x4_q8_0
  #define wsp_ggml_gemv_q4_0_4x8_q8_0_generic wsp_ggml_gemv_q4_0_4x8_q8_0
  #define wsp_ggml_gemv_q4_0_8x8_q8_0_generic wsp_ggml_gemv_q4_0_8x8_q8_0
+ #define wsp_ggml_gemv_q4_K_8x4_q8_K_generic wsp_ggml_gemv_q4_K_8x4_q8_K
  #define wsp_ggml_gemv_q4_K_8x8_q8_K_generic wsp_ggml_gemv_q4_K_8x8_q8_K
  #define wsp_ggml_gemv_q2_K_8x8_q8_K_generic wsp_ggml_gemv_q2_K_8x8_q8_K
  #define wsp_ggml_gemv_iq4_nl_4x4_q8_0_generic wsp_ggml_gemv_iq4_nl_4x4_q8_0
  #define wsp_ggml_gemv_iq4_nl_8x8_q8_0_generic wsp_ggml_gemv_iq4_nl_8x8_q8_0
+ #define wsp_ggml_gemv_q8_0_4x4_q8_0_generic wsp_ggml_gemv_q8_0_4x4_q8_0
+ #define wsp_ggml_gemv_q8_0_4x8_q8_0_generic wsp_ggml_gemv_q8_0_4x8_q8_0
  #define wsp_ggml_gemm_q4_0_4x4_q8_0_generic wsp_ggml_gemm_q4_0_4x4_q8_0
  #define wsp_ggml_gemm_q4_0_4x8_q8_0_generic wsp_ggml_gemm_q4_0_4x8_q8_0
  #define wsp_ggml_gemm_q4_0_8x8_q8_0_generic wsp_ggml_gemm_q4_0_8x8_q8_0
+ #define wsp_ggml_gemm_q4_K_8x4_q8_K_generic wsp_ggml_gemm_q4_K_8x4_q8_K
  #define wsp_ggml_gemm_q4_K_8x8_q8_K_generic wsp_ggml_gemm_q4_K_8x8_q8_K
  #define wsp_ggml_gemm_q2_K_8x8_q8_K_generic wsp_ggml_gemm_q2_K_8x8_q8_K
  #define wsp_ggml_gemm_iq4_nl_4x4_q8_0_generic wsp_ggml_gemm_iq4_nl_4x4_q8_0
  #define wsp_ggml_gemm_iq4_nl_8x8_q8_0_generic wsp_ggml_gemm_iq4_nl_8x8_q8_0
+ #define wsp_ggml_gemm_q8_0_4x4_q8_0_generic wsp_ggml_gemm_q8_0_4x4_q8_0
+ #define wsp_ggml_gemm_q8_0_4x8_q8_0_generic wsp_ggml_gemm_q8_0_4x8_q8_0
  #endif
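arch-fallback.h routes every repack kernel that has no hand-written implementation for the target architecture to the portable _generic version via #define aliases; the new q8_K 4x4 quantize, q4_K 8x4, and q8_0 4x4/4x8 entries above extend that table for the kernels added in this release. For illustration only, a minimal sketch of the aliasing pattern with hypothetical names (not the package's symbols):

#include <stdio.h>

/* Sketch: the portable kernel is written under a *_generic name; on targets
 * without a hand-optimized variant a #define renames it to the public symbol,
 * so callers always resolve to some implementation. */
#if defined(HAVE_FAST_GEMM)
/* a hand-written kernel provides the public symbol directly */
static void gemm_q8(int n) { printf("optimized gemm, n=%d\n", n); }
#else
/* no optimized kernel: the generic body below now defines gemm_q8() */
#define gemm_q8_generic gemm_q8
#endif

static void gemm_q8_generic(int n) { printf("portable reference gemm, n=%d\n", n); }

int main(void) {
    gemm_q8(64);  /* links either way */
    return 0;
}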
package/cpp/ggml-cpu/ggml-cpu-impl.h
@@ -328,7 +328,7 @@ inline static int32x4_t wsp_ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t

  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <intrin.h>
- #elif defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
+ #elif defined(__SSE__) || defined(__SSE3__) || defined(__SSSE3__) || defined(__AVX__) || defined(__F16C__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX512BF16__)
  #include <immintrin.h>
  #endif


package/cpp/ggml-cpu/ggml-cpu.c
@@ -81,6 +81,11 @@ struct wsp_ggml_arm_arch_features_type {
  } wsp_ggml_arm_arch_features = { 0 };
  #endif

+ #if defined(__riscv)
+ struct wsp_ggml_riscv_arch_features_type {
+ int rvv_vlen;
+ } wsp_ggml_riscv_arch_features = { 0 };
+ #endif

  #if defined(_WIN32)

@@ -187,6 +192,9 @@ typedef void * thread_ret_t;

  typedef pthread_t wsp_ggml_thread_t;

+ #define WSP_GGML_THREADPOOL_N_THREADS_MASK (0xffffU)
+ #define WSP_GGML_THREADPOOL_N_THREADS_BITS (16)
+
  #if defined(__APPLE__)
  #include <unistd.h>
  #include <mach/mach.h>
@@ -449,7 +457,7 @@ struct wsp_ggml_threadpool {
  struct wsp_ggml_cplan * cplan;

  // synchronization primitives
- atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
+ atomic_int n_graph; // updated when there is work to be done (i.e each graph) holds graph and active thread counts.
  atomic_int WSP_GGML_CACHE_ALIGN n_barrier;
  atomic_int WSP_GGML_CACHE_ALIGN n_barrier_passed;
  atomic_int WSP_GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
@@ -457,12 +465,10 @@ struct wsp_ggml_threadpool {
  // these are atomic as an annotation for thread-sanitizer
  atomic_bool stop; // Used for stopping the threadpool altogether
  atomic_bool pause; // Used for pausing the threadpool or individual threads
- atomic_int abort; // Used for aborting processing of a graph
+ atomic_int abort; // Used for aborting processing of a graph

  struct wsp_ggml_compute_state * workers; // per thread state
- int n_threads_max; // number of threads in the pool
- atomic_int n_threads_cur; // number of threads used in the current graph
-
+ int n_threads; // Number of threads in the pool
  int32_t prio; // Scheduling priority
  uint32_t poll; // Polling level (0 - no polling)

@@ -490,6 +496,15 @@ static inline void wsp_ggml_thread_cpu_relax(void) {
  static inline void wsp_ggml_thread_cpu_relax(void) {
  _mm_pause();
  }
+ #elif defined(__riscv)
+ static inline void wsp_ggml_thread_cpu_relax(void) {
+ #ifdef __riscv_zihintpause
+ __asm__ __volatile__ ("pause");
+ #else
+ /* Encoding of the pause instruction */
+ __asm__ __volatile__ (".4byte 0x100000F");
+ #endif
+ }
  #else
  static inline void wsp_ggml_thread_cpu_relax(void) {;}
  #endif
@@ -530,7 +545,7 @@ struct wsp_ggml_state {
  static struct wsp_ggml_state g_state = {0};

  void wsp_ggml_barrier(struct wsp_ggml_threadpool * tp) {
- int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
+ int n_threads = atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & WSP_GGML_THREADPOOL_N_THREADS_MASK;
  if (n_threads == 1) {
  return;
  }
@@ -547,7 +562,7 @@ void wsp_ggml_barrier(struct wsp_ggml_threadpool * tp) {
  // last thread
  atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);

- // exit barrier (fill seq-cst fence)
+ // exit barrier (full seq-cst fence)
  atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
  return;
  }
@@ -683,24 +698,25 @@ bool wsp_ggml_is_numa(void) {
  }

  #if defined(__ARM_ARCH)
-
- #if defined(__linux__) && defined(__aarch64__)
- #include <sys/auxv.h>
- #endif
-
- static void wsp_ggml_init_arm_arch_features(void) {
  #if defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
- #if defined(__linux__)
- wsp_ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+ #include <arm_sve.h>
+ static void wsp_ggml_init_arm_arch_features(void) {
+ wsp_ggml_arm_arch_features.sve_cnt = svcntb();
+ }
  #else
- // TODO: add support of SVE for non-linux systems
- #error "TODO: SVE is not supported on this platform. To use SVE, sve_cnt needs to be initialized here."
- #endif
+ static void wsp_ggml_init_arm_arch_features(void) {}
  #endif
- }
-
  #endif // __ARM_ARCH

+ #if defined(__riscv) && defined(__riscv_v_intrinsic)
+ #include <riscv_vector.h>
+ static void wsp_ggml_init_riscv_arch_features(void) {
+ wsp_ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb();
+ }
+ #else
+ static void wsp_ggml_init_riscv_arch_features(void) {}
+ #endif
+
  struct wsp_ggml_tensor * wsp_ggml_new_i32(struct wsp_ggml_context * ctx, int32_t value) {
  WSP_GGML_ASSERT(!wsp_ggml_get_no_alloc(ctx));

@@ -1927,6 +1943,10 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
  {
  wsp_ggml_compute_forward_argsort(params, tensor);
  } break;
+ case WSP_GGML_OP_TOP_K:
+ {
+ wsp_ggml_compute_forward_top_k(params, tensor);
+ } break;
  case WSP_GGML_OP_LEAKY_RELU:
  {
  wsp_ggml_compute_forward_leaky_relu(params, tensor);
@@ -2311,6 +2331,7 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
  case WSP_GGML_OP_ARANGE:
  case WSP_GGML_OP_TIMESTEP_EMBEDDING:
  case WSP_GGML_OP_ARGSORT:
+ case WSP_GGML_OP_TOP_K:
  case WSP_GGML_OP_FLASH_ATTN_EXT:
  case WSP_GGML_OP_FLASH_ATTN_BACK:
  case WSP_GGML_OP_SSM_CONV:
@@ -2622,7 +2643,7 @@ static void wsp_ggml_thread_cpumask_next(const bool * global_mask, bool * local_
  void wsp_ggml_threadpool_free(struct wsp_ggml_threadpool* threadpool) {
  if (!threadpool) return;

- const int n_threads = threadpool->n_threads_max;
+ const int n_threads = threadpool->n_threads;

  #ifndef WSP_GGML_USE_OPENMP
  struct wsp_ggml_compute_state* workers = threadpool->workers;
@@ -2698,9 +2719,14 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(
  //WSP_GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
  }
  if (n_threads <= 0) {
- n_threads = threadpool ? threadpool->n_threads_max : WSP_GGML_DEFAULT_N_THREADS;
+ n_threads = threadpool ? threadpool->n_threads : WSP_GGML_DEFAULT_N_THREADS;
  }

+ #if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
+ // Emscripten without pthreads support can only use a single thread
+ n_threads = 1;
+ #endif
+
  size_t work_size = 0;

  struct wsp_ggml_cplan cplan;
@@ -2834,6 +2860,10 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(
  cur += sizeof(wsp_ggml_fp16_t)*ne00*ne01*ne02*ne03;
  cur += sizeof(wsp_ggml_fp16_t)*ne10*ne11*ne12;
  } break;
+ case WSP_GGML_OP_TOP_K:
+ {
+ cur += sizeof(int32_t)*node->src[0]->ne[0]*n_tasks;
+ } break;
  case WSP_GGML_OP_FLASH_ATTN_EXT:
  {
  const int64_t ne10 = node->src[1]->ne[0]; // DK
@@ -2897,12 +2927,14 @@ static thread_ret_t wsp_ggml_graph_compute_thread(void * data) {

  struct wsp_ggml_compute_params params = {
  /*.ith =*/ state->ith,
- /*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
+ /*.nth =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & WSP_GGML_THREADPOOL_N_THREADS_MASK,
  /*.wsize =*/ cplan->work_size,
  /*.wdata =*/ cplan->work_data,
  /*.threadpool=*/ tp,
  };

+ WSP_GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
  for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
  struct wsp_ggml_tensor * node = cgraph->nodes[node_n];

@@ -2924,6 +2956,8 @@ static thread_ret_t wsp_ggml_graph_compute_thread(void * data) {
  }
  }

+ WSP_GGML_PRINT_DEBUG("thread #%d compute-done cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
+
  wsp_ggml_barrier(state->threadpool);

  return 0;
@@ -2931,27 +2965,23 @@ static thread_ret_t wsp_ggml_graph_compute_thread(void * data) {

  #ifndef WSP_GGML_USE_OPENMP

- // check if thread is active
- static inline bool wsp_ggml_graph_compute_thread_active(struct wsp_ggml_compute_state * state) {
- struct wsp_ggml_threadpool * threadpool = state->threadpool;
- int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
- return (state->ith < n_threads);
- }
-
  // check if thread is ready to proceed (exit from polling or sleeping)
+ // returns true if loops should exit, sets state->pending to indicate new work
  static inline bool wsp_ggml_graph_compute_thread_ready(struct wsp_ggml_compute_state * state) {
  struct wsp_ggml_threadpool * threadpool = state->threadpool;

  if (state->pending || threadpool->stop || threadpool->pause) { return true; }

  // check for new graph/work
- int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
- if (new_graph != state->last_graph) {
- state->pending = wsp_ggml_graph_compute_thread_active(state);
- state->last_graph = new_graph;
+ int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
+ int n_threads = n_graph & WSP_GGML_THREADPOOL_N_THREADS_MASK;
+ if (n_graph != state->last_graph) {
+ state->pending = (state->ith < n_threads);
+ state->last_graph = n_graph;
+ return true;
  }

- return state->pending;
+ return false;
  }

  // sync thread state after polling
@@ -2968,11 +2998,6 @@ static inline void wsp_ggml_graph_compute_thread_sync(struct wsp_ggml_compute_st
  static inline bool wsp_ggml_graph_compute_poll_for_work(struct wsp_ggml_compute_state * state) {
  struct wsp_ggml_threadpool * threadpool = state->threadpool;

- // Skip polling for unused threads
- if (!wsp_ggml_graph_compute_thread_active(state)) {
- return state->pending;
- }
-
  // This seems to make 0 ... 100 a decent range for polling level across modern processors.
  // Perhaps, we can adjust it dynamically based on load and things.
  const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
@@ -3034,7 +3059,6 @@ static thread_ret_t wsp_ggml_graph_compute_secondary_thread(void* data) {
  wsp_ggml_graph_compute_check_for_work(state);
  if (state->pending) {
  state->pending = false;
-
  wsp_ggml_graph_compute_thread(state);
  }
  }
@@ -3049,14 +3073,15 @@ static void wsp_ggml_graph_compute_kickoff(struct wsp_ggml_threadpo

  wsp_ggml_mutex_lock(&threadpool->mutex);

- WSP_GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
+ // Update the number of active threads and the graph count
+ int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed) >> WSP_GGML_THREADPOOL_N_THREADS_BITS;
+ n_graph = ((n_graph + 1) << WSP_GGML_THREADPOOL_N_THREADS_BITS) | (n_threads & WSP_GGML_THREADPOOL_N_THREADS_MASK);

- // Update the number of active threads
- atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+ WSP_GGML_PRINT_DEBUG("compute-kickoff: n_threads %d n_graph %d\n", n_threads, n_graph);

  // Indicate the graph is ready to be processed
  // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
- atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
+ atomic_store_explicit(&threadpool->n_graph, n_graph, memory_order_seq_cst)

  if (threadpool->pause) {
  // Update main thread prio and affinity to match the threadpool settings
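The threadpool hunks above fold the old n_threads_cur counter into the single atomic n_graph word: the low 16 bits (WSP_GGML_THREADPOOL_N_THREADS_MASK) carry the number of threads active for the current graph, and the upper bits carry a graph counter that kickoff bumps so polling or sleeping workers notice new work. For illustration only, a small standalone C sketch of that encoding (pack_next is a hypothetical helper, not a package function):

#include <stdio.h>

#define N_THREADS_MASK (0xffffU)
#define N_THREADS_BITS (16)

/* Mirror of the packing done in the kickoff hunk above: bump the graph counter
 * in the high bits, store the active thread count in the low 16 bits. */
static int pack_next(int prev, int n_threads) {
    int graph_id = (prev >> N_THREADS_BITS) + 1;
    return (graph_id << N_THREADS_BITS) | (n_threads & N_THREADS_MASK);
}

int main(void) {
    int n_graph = 0;
    n_graph = pack_next(n_graph, 8);   /* first graph, 8 active threads  */
    n_graph = pack_next(n_graph, 4);   /* second graph, 4 active threads */
    printf("graph id = %d, active threads = %d\n",
           n_graph >> N_THREADS_BITS, n_graph & N_THREADS_MASK);
    return 0;
}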
@@ -3094,8 +3119,7 @@ static struct wsp_ggml_threadpool * wsp_ggml_threadpool_new_impl(
  threadpool->pause = tpp->paused;
  threadpool->abort = -1;
  threadpool->workers = NULL;
- threadpool->n_threads_max = tpp->n_threads;
- threadpool->n_threads_cur = tpp->n_threads;
+ threadpool->n_threads = tpp->n_threads;
  threadpool->poll = tpp->poll;
  threadpool->prio = tpp->prio;
  threadpool->ec = WSP_GGML_STATUS_SUCCESS;
@@ -3190,7 +3214,7 @@ enum wsp_ggml_status wsp_ggml_graph_compute(struct wsp_ggml_cgraph * cgraph, str
  {
  // update the number of threads from the actual number of threads that we got from OpenMP
  n_threads = omp_get_num_threads();
- atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+ atomic_store_explicit(&threadpool->n_graph, n_threads, memory_order_relaxed);
  }

  // Apply thread CPU mask and priority
@@ -3203,13 +3227,13 @@ enum wsp_ggml_status wsp_ggml_graph_compute(struct wsp_ggml_cgraph * cgraph, str
  wsp_ggml_graph_compute_thread(&threadpool->workers[ith]);
  }
  } else {
- atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
+ atomic_store_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
  wsp_ggml_graph_compute_thread(&threadpool->workers[0]);
  }
  #else
- if (n_threads > threadpool->n_threads_max) {
- WSP_GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
- n_threads = threadpool->n_threads_max;
+ if (n_threads > threadpool->n_threads) {
+ WSP_GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads);
+ n_threads = threadpool->n_threads;
  }

  // Kick all threads to start the new graph
@@ -3296,13 +3320,33 @@ void wsp_ggml_cpu_fp16_to_fp32(const wsp_ggml_fp16_t * x, float * y, int64_t n)
  __m128 y_vec = _mm_cvtph_ps(x_vec);
  _mm_storeu_ps(y + i, y_vec);
  }
- #elif defined(__riscv_zvfh)
- for (int vl; i < n; i += vl) {
- vl = __riscv_vsetvl_e16m1(n - i);
- vfloat16m1_t vx = __riscv_vle16_v_f16m1((_Float16 *)&x[i], vl);
- vfloat32m2_t vy = __riscv_vfwcvt_f_f_v_f32m2(vx, vl);
- __riscv_vse32_v_f32m2(&y[i], vy, vl);
+
+ #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfhmin)
+ // calculate step size
+ const int epr = __riscv_vsetvlmax_e16m2();
+ const int step = epr * 2;
+ const int np = (n & ~(step - 1));
+
+ // unroll by 2
+ for (; i < np; i += step) {
+ vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, epr);
+ vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, epr);
+ __riscv_vse32_v_f32m4(y + i, ay0, epr);
+
+ vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16*)x + i + epr, epr);
+ vfloat32m4_t ay1 = __riscv_vfwcvt_f_f_v_f32m4(ax1, epr);
+ __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
  }
+
+ // leftovers
+ int vl;
+ for (i = np; i < n; i += vl) {
+ vl = __riscv_vsetvl_e16m2(n - i);
+ vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, vl);
+ vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, vl);
+ __riscv_vse32_v_f32m4(y + i, ay0, vl);
+ }
+
  #endif

  for (; i < n; ++i) {
@@ -3347,6 +3391,31 @@ void wsp_ggml_cpu_bf16_to_fp32(const wsp_ggml_bf16_t * x, float * y, int64_t n)
  (const __m128i *)(x + i))),
  16)));
  }
+ #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfmin)
+ // calculate step size
+ const int epr = __riscv_vsetvlmax_e16m2();
+ const int step = epr * 2;
+ const int np = (n & ~(step - 1));
+
+ // unroll by 2
+ for (; i < np; i += step) {
+ vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, epr);
+ vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, epr);
+ __riscv_vse32_v_f32m4(y + i, ay0, epr);
+
+ vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16*)x + i + epr, epr);
+ vfloat32m4_t ay1 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax1, epr);
+ __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
+ }
+
+ // leftovers
+ int vl;
+ for (i = np; i < n; i += vl) {
+ vl = __riscv_vsetvl_e16m2(n - i);
+ vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, vl);
+ vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, vl);
+ __riscv_vse32_v_f32m4(y + i, ay0, vl);
+ }
  #endif
  for (; i < n; i++) {
  y[i] = WSP_GGML_BF16_TO_FP32(x[i]);
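The two RVV hunks above widen fp16 and bf16 input two vector registers per iteration and finish with a vl-limited tail loop; the scalar loop that follows handles whatever the active SIMD path did not cover. For bf16 the widening itself is trivial, since a bf16 value is just the upper 16 bits of an IEEE-754 binary32. A small standalone reference sketch (bf16_to_fp32 here is illustrative, not the package's helper):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Scalar reference for the bf16 -> fp32 widening the vectorized loop performs:
 * shift the 16 stored bits into the high half of a 32-bit word and bit-copy. */
static float bf16_to_fp32(uint16_t h) {
    uint32_t bits = (uint32_t)h << 16;
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}

int main(void) {
    uint16_t one_bf16 = 0x3F80;                /* bf16 encoding of 1.0f */
    printf("%f\n", bf16_to_fp32(one_bf16));    /* prints 1.000000 */
    return 0;
}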
@@ -3449,6 +3518,14 @@ int wsp_ggml_cpu_has_riscv_v(void) {
  #endif
  }

+ int wsp_ggml_cpu_get_rvv_vlen(void) {
+ #if defined(__riscv) && defined(__riscv_v_intrinsic)
+ return wsp_ggml_riscv_arch_features.rvv_vlen;
+ #else
+ return 0;
+ #endif
+ }
+
  int wsp_ggml_cpu_has_f16c(void) {
  #if defined(__F16C__)
  return 1;
@@ -3615,6 +3692,10 @@ void wsp_ggml_cpu_init(void) {
  wsp_ggml_init_arm_arch_features();
  #endif

+ #if defined(__riscv)
+ wsp_ggml_init_riscv_arch_features();
+ #endif
+
  is_first_call = false;
  }