whisper.rn 0.4.0-rc.9 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. package/README.md +74 -1
  2. package/android/build.gradle +12 -3
  3. package/android/src/main/CMakeLists.txt +43 -13
  4. package/android/src/main/java/com/rnwhisper/RNWhisper.java +211 -0
  5. package/android/src/main/java/com/rnwhisper/WhisperContext.java +64 -36
  6. package/android/src/main/java/com/rnwhisper/WhisperVadContext.java +157 -0
  7. package/android/src/main/jni.cpp +205 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnwhisper.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnwhisper_v8fp16_va_2.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/librnwhisper.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/librnwhisper_vfpv4.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnwhisper.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnwhisper_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +26 -0
  15. package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +26 -0
  16. package/cpp/coreml/whisper-compat.h +10 -0
  17. package/cpp/coreml/whisper-compat.m +35 -0
  18. package/cpp/coreml/whisper-decoder-impl.h +27 -15
  19. package/cpp/coreml/whisper-decoder-impl.m +36 -10
  20. package/cpp/coreml/whisper-encoder-impl.h +21 -9
  21. package/cpp/coreml/whisper-encoder-impl.m +29 -3
  22. package/cpp/ggml-alloc.c +39 -37
  23. package/cpp/ggml-alloc.h +1 -1
  24. package/cpp/ggml-backend-impl.h +55 -27
  25. package/cpp/ggml-backend-reg.cpp +591 -0
  26. package/cpp/ggml-backend.cpp +336 -955
  27. package/cpp/ggml-backend.h +70 -42
  28. package/cpp/ggml-common.h +57 -49
  29. package/cpp/ggml-cpp.h +39 -0
  30. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  31. package/cpp/ggml-cpu/amx/amx.h +8 -0
  32. package/cpp/ggml-cpu/amx/common.h +91 -0
  33. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  34. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  35. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  36. package/cpp/ggml-cpu/arch/arm/quants.c +4113 -0
  37. package/cpp/ggml-cpu/arch/arm/repack.cpp +2162 -0
  38. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  39. package/cpp/ggml-cpu/arch/x86/quants.c +4310 -0
  40. package/cpp/ggml-cpu/arch/x86/repack.cpp +3284 -0
  41. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  42. package/cpp/ggml-cpu/binary-ops.cpp +158 -0
  43. package/cpp/ggml-cpu/binary-ops.h +16 -0
  44. package/cpp/ggml-cpu/common.h +72 -0
  45. package/cpp/ggml-cpu/ggml-cpu-impl.h +511 -0
  46. package/cpp/ggml-cpu/ggml-cpu.c +3473 -0
  47. package/cpp/ggml-cpu/ggml-cpu.cpp +671 -0
  48. package/cpp/ggml-cpu/ops.cpp +9085 -0
  49. package/cpp/ggml-cpu/ops.h +111 -0
  50. package/cpp/ggml-cpu/quants.c +1157 -0
  51. package/cpp/ggml-cpu/quants.h +89 -0
  52. package/cpp/ggml-cpu/repack.cpp +1570 -0
  53. package/cpp/ggml-cpu/repack.h +98 -0
  54. package/cpp/ggml-cpu/simd-mappings.h +1006 -0
  55. package/cpp/ggml-cpu/traits.cpp +36 -0
  56. package/cpp/ggml-cpu/traits.h +38 -0
  57. package/cpp/ggml-cpu/unary-ops.cpp +186 -0
  58. package/cpp/ggml-cpu/unary-ops.h +28 -0
  59. package/cpp/ggml-cpu/vec.cpp +321 -0
  60. package/cpp/ggml-cpu/vec.h +973 -0
  61. package/cpp/ggml-cpu.h +143 -0
  62. package/cpp/ggml-impl.h +417 -23
  63. package/cpp/ggml-metal-impl.h +622 -0
  64. package/cpp/ggml-metal.h +9 -9
  65. package/cpp/ggml-metal.m +3451 -1344
  66. package/cpp/ggml-opt.cpp +1037 -0
  67. package/cpp/ggml-opt.h +237 -0
  68. package/cpp/ggml-quants.c +296 -10818
  69. package/cpp/ggml-quants.h +78 -125
  70. package/cpp/ggml-threading.cpp +12 -0
  71. package/cpp/ggml-threading.h +14 -0
  72. package/cpp/ggml-whisper-sim.metallib +0 -0
  73. package/cpp/ggml-whisper.metallib +0 -0
  74. package/cpp/ggml.c +4633 -21450
  75. package/cpp/ggml.h +320 -661
  76. package/cpp/gguf.cpp +1347 -0
  77. package/cpp/gguf.h +202 -0
  78. package/cpp/rn-whisper.cpp +4 -11
  79. package/cpp/whisper-arch.h +197 -0
  80. package/cpp/whisper.cpp +2022 -495
  81. package/cpp/whisper.h +75 -18
  82. package/ios/CMakeLists.txt +95 -0
  83. package/ios/RNWhisper.h +5 -0
  84. package/ios/RNWhisper.mm +147 -0
  85. package/ios/RNWhisperAudioUtils.m +4 -0
  86. package/ios/RNWhisperContext.h +5 -0
  87. package/ios/RNWhisperContext.mm +22 -26
  88. package/ios/RNWhisperVadContext.h +29 -0
  89. package/ios/RNWhisperVadContext.mm +152 -0
  90. package/ios/rnwhisper.xcframework/Info.plist +74 -0
  91. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
  92. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
  93. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +354 -0
  94. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-common.h +1861 -0
  95. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
  96. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
  97. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +603 -0
  98. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
  99. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +66 -0
  100. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-opt.h +237 -0
  101. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-quants.h +100 -0
  102. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-threading.h +14 -0
  103. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +2221 -0
  104. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/gguf.h +202 -0
  105. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
  106. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
  107. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper.h +52 -0
  108. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper-arch.h +197 -0
  109. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +739 -0
  110. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  111. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  112. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  113. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
  114. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
  115. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +354 -0
  116. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +1861 -0
  117. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
  118. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
  119. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +603 -0
  120. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
  121. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +66 -0
  122. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +237 -0
  123. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +100 -0
  124. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-threading.h +14 -0
  125. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +2221 -0
  126. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/gguf.h +202 -0
  127. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
  128. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
  129. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +52 -0
  130. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper-arch.h +197 -0
  131. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +739 -0
  132. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  133. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +101 -0
  134. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  135. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  136. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
  137. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
  138. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +354 -0
  139. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-common.h +1861 -0
  140. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
  141. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
  142. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +603 -0
  143. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
  144. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +66 -0
  145. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-opt.h +237 -0
  146. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-quants.h +100 -0
  147. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-threading.h +14 -0
  148. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +2221 -0
  149. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/gguf.h +202 -0
  150. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
  151. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
  152. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper.h +52 -0
  153. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper-arch.h +197 -0
  154. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +739 -0
  155. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  156. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  157. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  158. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
  159. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
  160. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +354 -0
  161. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +1861 -0
  162. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
  163. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
  164. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +603 -0
  165. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
  166. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +66 -0
  167. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +237 -0
  168. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +100 -0
  169. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-threading.h +14 -0
  170. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +2221 -0
  171. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/gguf.h +202 -0
  172. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
  173. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
  174. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +52 -0
  175. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper-arch.h +197 -0
  176. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +739 -0
  177. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  178. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +101 -0
  179. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  180. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  181. package/jest/mock.js +24 -0
  182. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  183. package/lib/commonjs/index.js +111 -1
  184. package/lib/commonjs/index.js.map +1 -1
  185. package/lib/commonjs/version.json +1 -1
  186. package/lib/module/NativeRNWhisper.js.map +1 -1
  187. package/lib/module/index.js +112 -0
  188. package/lib/module/index.js.map +1 -1
  189. package/lib/module/version.json +1 -1
  190. package/lib/typescript/NativeRNWhisper.d.ts +35 -0
  191. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  192. package/lib/typescript/index.d.ts +39 -3
  193. package/lib/typescript/index.d.ts.map +1 -1
  194. package/package.json +10 -6
  195. package/src/NativeRNWhisper.ts +48 -0
  196. package/src/index.ts +132 -1
  197. package/src/version.json +1 -1
  198. package/whisper-rn.podspec +11 -18
  199. package/cpp/README.md +0 -4
  200. package/cpp/ggml-aarch64.c +0 -3209
  201. package/cpp/ggml-aarch64.h +0 -39
  202. package/cpp/ggml-cpu-impl.h +0 -614
package/cpp/whisper.h CHANGED
@@ -2,6 +2,7 @@
2
2
  #define WHISPER_H
3
3
 
4
4
  #include "ggml.h"
5
+ #include "ggml-cpu.h"
5
6
 
6
7
  #include <stddef.h>
7
8
  #include <stdint.h>
@@ -189,6 +190,15 @@ extern "C" {
189
190
  uint32_t value; // Unicode code point or rule ID
190
191
  } whisper_grammar_element;
191
192
 
193
+ typedef struct whisper_vad_params {
194
+ float threshold; // Probability threshold to consider as speech.
195
+ int min_speech_duration_ms; // Min duration for a valid speech segment.
196
+ int min_silence_duration_ms; // Min silence duration to consider speech as ended.
197
+ float max_speech_duration_s; // Max duration of a speech segment before forcing a new segment.
198
+ int speech_pad_ms; // Padding added before and after speech segments.
199
+ float samples_overlap; // Overlap in seconds when copying audio samples from speech segment.
200
+ } whisper_vad_params;
201
+
192
202
  // Various functions for loading a ggml whisper model.
193
203
  // Allocate (almost) all memory needed for the model.
194
204
  // Return NULL on failure
@@ -425,21 +435,11 @@ extern "C" {
425
435
 
426
436
  // Performance information from the default state.
427
437
  struct whisper_timings {
428
- int64_t load_us;
429
- int64_t t_start_us;
430
- int32_t fail_p;
431
- int32_t fail_h;
432
- int64_t t_mel_us;
433
- int32_t n_sample;
434
- int32_t n_encode;
435
- int32_t n_decode;
436
- int32_t n_batchd;
437
- int32_t n_prompt;
438
- int64_t t_sample_us;
439
- int64_t t_encode_us;
440
- int64_t t_decode_us;
441
- int64_t t_batchd_us;
442
- int64_t t_prompt_us;
438
+ float sample_ms;
439
+ float encode_ms;
440
+ float decode_ms;
441
+ float batchd_ms;
442
+ float prompt_ms;
443
443
  };
444
444
  WHISPER_API struct whisper_timings * whisper_get_timings(struct whisper_context * ctx);
445
445
  WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
@@ -532,8 +532,8 @@ extern "C" {
532
532
  bool detect_language;
533
533
 
534
534
  // common decoding parameters:
535
- bool suppress_blank; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
536
- bool suppress_non_speech_tokens; // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
535
+ bool suppress_blank; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
536
+ bool suppress_nst; // non-speech tokens, ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
537
537
 
538
538
  float temperature; // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478
539
539
  float max_initial_ts; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
@@ -544,7 +544,7 @@ extern "C" {
544
544
  float temperature_inc;
545
545
  float entropy_thold; // similar to OpenAI's "compression_ratio_threshold"
546
546
  float logprob_thold;
547
- float no_speech_thold; // TODO: not implemented
547
+ float no_speech_thold;
548
548
 
549
549
  struct {
550
550
  int best_of; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
@@ -580,11 +580,18 @@ extern "C" {
580
580
  size_t n_grammar_rules;
581
581
  size_t i_start_rule;
582
582
  float grammar_penalty;
583
+
584
+ // Voice Activity Detection (VAD) params
585
+ bool vad; // Enable VAD
586
+ const char * vad_model_path; // Path to VAD model
587
+
588
+ whisper_vad_params vad_params;
583
589
  };
584
590
 
585
591
  // NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_context_params & whisper_free_params()
586
592
  WHISPER_API struct whisper_context_params * whisper_context_default_params_by_ref(void);
587
593
  WHISPER_API struct whisper_context_params whisper_context_default_params (void);
594
+
588
595
  WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy);
589
596
  WHISPER_API struct whisper_full_params whisper_full_default_params (enum whisper_sampling_strategy strategy);
590
597
 
@@ -662,6 +669,53 @@ extern "C" {
662
669
  WHISPER_API float whisper_full_get_token_p (struct whisper_context * ctx, int i_segment, int i_token);
663
670
  WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
664
671
 
672
+ //
673
+ // Voice Activity Detection (VAD)
674
+ //
675
+
676
+ struct whisper_vad_context;
677
+
678
+ WHISPER_API struct whisper_vad_params whisper_vad_default_params(void);
679
+
680
+ struct whisper_vad_context_params {
681
+ int n_threads; // The number of threads to use for processing.
682
+ bool use_gpu;
683
+ int gpu_device; // CUDA device
684
+ };
685
+
686
+ WHISPER_API struct whisper_vad_context_params whisper_vad_default_context_params(void);
687
+
688
+ WHISPER_API struct whisper_vad_context * whisper_vad_init_from_file_with_params(const char * path_model, struct whisper_vad_context_params params);
689
+ WHISPER_API struct whisper_vad_context * whisper_vad_init_with_params (struct whisper_model_loader * loader, struct whisper_vad_context_params params);
690
+
691
+ WHISPER_API bool whisper_vad_detect_speech(
692
+ struct whisper_vad_context * vctx,
693
+ const float * samples,
694
+ int n_samples);
695
+
696
+ WHISPER_API int whisper_vad_n_probs(struct whisper_vad_context * vctx);
697
+ WHISPER_API float * whisper_vad_probs (struct whisper_vad_context * vctx);
698
+
699
+ struct whisper_vad_segments;
700
+
701
+ WHISPER_API struct whisper_vad_segments * whisper_vad_segments_from_probs(
702
+ struct whisper_vad_context * vctx,
703
+ struct whisper_vad_params params);
704
+
705
+ WHISPER_API struct whisper_vad_segments * whisper_vad_segments_from_samples(
706
+ struct whisper_vad_context * vctx,
707
+ struct whisper_vad_params params,
708
+ const float * samples,
709
+ int n_samples);
710
+
711
+ WHISPER_API int whisper_vad_segments_n_segments(struct whisper_vad_segments * segments);
712
+
713
+ WHISPER_API float whisper_vad_segments_get_segment_t0(struct whisper_vad_segments * segments, int i_segment);
714
+ WHISPER_API float whisper_vad_segments_get_segment_t1(struct whisper_vad_segments * segments, int i_segment);
715
+
716
+ WHISPER_API void whisper_vad_free_segments(struct whisper_vad_segments * segments);
717
+ WHISPER_API void whisper_vad_free (struct whisper_vad_context * ctx);
718
+
665
719
  ////////////////////////////////////////////////////////////////////////////
666
720
 
667
721
  // Temporary helpers needed for exposing ggml interface
@@ -675,6 +729,9 @@ extern "C" {
675
729
 
676
730
  WHISPER_API void whisper_log_set(wsp_ggml_log_callback log_callback, void * user_data);
677
731
 
732
+ // Get the no_speech probability for the specified segment
733
+ WHISPER_API float whisper_full_get_segment_no_speech_prob (struct whisper_context * ctx, int i_segment);
734
+ WHISPER_API float whisper_full_get_segment_no_speech_prob_from_state(struct whisper_state * state, int i_segment);
678
735
  #ifdef __cplusplus
679
736
  }
680
737
  #endif
@@ -0,0 +1,95 @@
1
+ cmake_minimum_required(VERSION 3.16)
2
+ project(rnwhisper VERSION 1.0.0 LANGUAGES CXX C)
3
+
4
+ set(CMAKE_CXX_STANDARD 17)
5
+ set(CMAKE_CXX_STANDARD_REQUIRED ON)
6
+
7
+ # iOS specific settings
8
+ set(CMAKE_OSX_DEPLOYMENT_TARGET 13.0)
9
+ set(CMAKE_XCODE_ATTRIBUTE_ENABLE_BITCODE NO)
10
+
11
+ # Dependencies and compile options
12
# Preprocessor definitions shared by every source file of the framework.
add_definitions(
  -DNDEBUG
  -DWSP_GGML_USE_CPU
  -DWSP_GGML_USE_ACCELERATE
  -DWSP_GGML_USE_METAL
  -DWSP_GGML_METAL_USE_BF16
)

# The previous "-DO3" entry only defined a preprocessor macro named O3; it did not
# select an optimisation level. Request the optimisation level explicitly instead.
add_compile_options(-O3)
20
+
21
+ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64;x86_64")
22
+ add_definitions(-DWSP_GGML_CPU_GENERIC)
23
+ endif ()
24
+
25
+ set(SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../cpp)
26
+
27
+ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64")
28
+ set(SOURCE_FILES_ARCH
29
+ ${SOURCE_DIR}/ggml-cpu/arch/arm/quants.c
30
+ ${SOURCE_DIR}/ggml-cpu/arch/arm/repack.cpp
31
+ )
32
+ endif ()
33
+
34
+ # Define public headers
35
+ set(PUBLIC_HEADERS
36
+ ${SOURCE_DIR}/rn-whisper.h
37
+ ${SOURCE_DIR}/whisper.h
38
+ ${SOURCE_DIR}/ggml.h
39
+ )
40
+
41
+ # Create library target
42
+ add_library(rnwhisper SHARED
43
+ ${SOURCE_DIR}/ggml.c
44
+ ${SOURCE_DIR}/ggml-alloc.c
45
+ ${SOURCE_DIR}/ggml-backend.cpp
46
+ ${SOURCE_DIR}/ggml-backend-reg.cpp
47
+ ${SOURCE_DIR}/ggml-cpu/amx/amx.cpp
48
+ ${SOURCE_DIR}/ggml-cpu/amx/mmq.cpp
49
+ ${SOURCE_DIR}/ggml-cpu/ggml-cpu.c
50
+ ${SOURCE_DIR}/ggml-cpu/ggml-cpu.cpp
51
+ ${SOURCE_DIR}/ggml-cpu/quants.c
52
+ ${SOURCE_DIR}/ggml-cpu/traits.cpp
53
+ ${SOURCE_DIR}/ggml-cpu/repack.cpp
54
+ ${SOURCE_DIR}/ggml-cpu/unary-ops.cpp
55
+ ${SOURCE_DIR}/ggml-cpu/binary-ops.cpp
56
+ ${SOURCE_DIR}/ggml-cpu/vec.cpp
57
+ ${SOURCE_DIR}/ggml-cpu/ops.cpp
58
+ ${SOURCE_DIR}/ggml-metal.m
59
+ ${SOURCE_DIR}/ggml-opt.cpp
60
+ ${SOURCE_DIR}/ggml-threading.cpp
61
+ ${SOURCE_DIR}/ggml-quants.c
62
+ ${SOURCE_DIR}/gguf.cpp
63
+ ${SOURCE_DIR}/whisper.cpp
64
+ ${SOURCE_DIR}/rn-whisper.cpp
65
+ ${SOURCE_DIR}/rn-audioutils.cpp
66
+ ${SOURCE_FILES_ARCH}
67
+ )
68
+
69
+ # Setup include directories
70
+ target_include_directories(rnwhisper
71
+ PUBLIC
72
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp>
73
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp/ggml-cpu>
74
+ $<INSTALL_INTERFACE:include>
75
+ )
76
+
77
+ # Link required frameworks
78
+ target_link_libraries(rnwhisper PRIVATE
79
+ "-framework Accelerate"
80
+ "-framework Foundation"
81
+ "-framework Metal"
82
+ "-framework MetalKit"
83
+ )
84
+
85
+ # Set properties for framework
86
+ set_target_properties(rnwhisper PROPERTIES
87
+ MACOSX_FRAMEWORK_IDENTIFIER "com.rnwhisper"
88
+ MACOSX_FRAMEWORK_BUNDLE_VERSION 1.0.0
89
+ MACOSX_FRAMEWORK_SHORT_VERSION_STRING 1.0.0
90
+ FRAMEWORK TRUE
91
+ FRAMEWORK_VERSION 1.0.0
92
+ VERSION 1.0.0
93
+ PUBLIC_HEADER "${PUBLIC_HEADERS}"
94
+ XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC NO
95
+ )
package/ios/RNWhisper.h CHANGED
@@ -1,6 +1,11 @@
1
1
  #ifdef __cplusplus
2
+ #if RNWHISPER_BUILD_FROM_SOURCE
2
3
  #import "whisper.h"
3
4
  #import "rn-whisper.h"
5
+ #else
6
+ #import <rnwhisper/whisper.h>
7
+ #import <rnwhisper/rn-whisper.h>
8
+ #endif
4
9
  #endif
5
10
 
6
11
  #import <React/RCTBridgeModule.h>
package/ios/RNWhisper.mm CHANGED
@@ -1,5 +1,6 @@
1
1
  #import "RNWhisper.h"
2
2
  #import "RNWhisperContext.h"
3
+ #import "RNWhisperVadContext.h"
3
4
  #import "RNWhisperDownloader.h"
4
5
  #import "RNWhisperAudioUtils.h"
5
6
  #import "RNWhisperAudioSessionUtils.h"
@@ -13,6 +14,7 @@
13
14
  @implementation RNWhisper
14
15
 
15
16
  NSMutableDictionary *contexts;
17
+ NSMutableDictionary *vadContexts;
16
18
 
17
19
  RCT_EXPORT_MODULE()
18
20
 
@@ -366,6 +368,15 @@ RCT_REMAP_METHOD(releaseAllContexts,
366
368
  [context invalidate];
367
369
  }
368
370
 
371
+ if (vadContexts != nil) {
372
+ for (NSNumber *contextId in vadContexts) {
373
+ RNWhisperVadContext *vadContext = vadContexts[contextId];
374
+ [vadContext invalidate];
375
+ }
376
+ [vadContexts removeAllObjects];
377
+ vadContexts = nil;
378
+ }
379
+
369
380
  rnwhisper::job_abort_all(); // graceful abort
370
381
 
371
382
  [contexts removeAllObjects];
@@ -437,6 +448,142 @@ RCT_REMAP_METHOD(setAudioSessionActive,
437
448
  resolve(nil);
438
449
  }
439
450
 
451
/// Creates a VAD context for the model described by `vadOptions` and registers it in
/// `vadContexts` under a randomly drawn context id.
///
/// Options read from `vadOptions`:
///   - `filePath`: local path, http(s) URL, or bundle resource name of the model.
///   - `isBundleAsset`: when true, `filePath` is looked up in the main bundle.
///   - `useGpu`: forwarded (inverted) as `noMetal`.
///   - `nThreads`: forwarded unchanged to the context.
///
/// Resolves with `contextId`, `gpu` and `reasonNoGPU`. Rejects with
/// `whisper_vad_error` when the model path cannot be resolved or the model fails
/// to load.
RCT_REMAP_METHOD(initVadContext,
                 withVadOptions:(NSDictionary *)vadOptions
                 withResolver:(RCTPromiseResolveBlock)resolve
                 withRejecter:(RCTPromiseRejectBlock)reject)
{
    if (vadContexts == nil) {
        vadContexts = [[NSMutableDictionary alloc] init];
    }

    NSString *modelPath = [vadOptions objectForKey:@"filePath"];
    BOOL isBundleAsset = [[vadOptions objectForKey:@"isBundleAsset"] boolValue];
    BOOL useGpu = [[vadOptions objectForKey:@"useGpu"] boolValue];
    NSNumber *nThreads = [vadOptions objectForKey:@"nThreads"];

    NSString *path = modelPath;
    if ([path hasPrefix:@"http://"] || [path hasPrefix:@"https://"]) {
        path = [RNWhisperDownloader downloadFile:path toFile:nil];
    }
    if (isBundleAsset) {
        path = [[NSBundle mainBundle] pathForResource:modelPath ofType:nil];
    }

    // A missing option, an unknown bundle resource or a failed download leaves `path`
    // nil; fail the promise here instead of handing nil to the model loader.
    if (path == nil) {
        reject(@"whisper_vad_error", @"VAD model path could not be resolved", nil);
        return;
    }

    // Draw ids until one is unused, so that an existing context is never silently
    // replaced (and therefore never left without an `invalidate` call).
    int contextId = 0;
    do {
        contextId = arc4random_uniform(1000000);
    } while (vadContexts[@(contextId)] != nil);

    RNWhisperVadContext *vadContext = [RNWhisperVadContext
        initWithModelPath:path
        contextId:contextId
        noMetal:!useGpu
        nThreads:nThreads
    ];
    if ([vadContext getVadContext] == NULL) {
        reject(@"whisper_vad_error", @"Failed to load the VAD model", nil);
        return;
    }

    [vadContexts setObject:vadContext forKey:@(contextId)];

    // Dictionary literals throw on nil values, so substitute an empty string when
    // no reason was recorded.
    NSString *reasonNoGPU = [vadContext reasonNoMetal];
    resolve(@{
        @"contextId": @(contextId),
        @"gpu": @([vadContext isMetalEnabled]),
        @"reasonNoGPU": reasonNoGPU != nil ? reasonNoGPU : @"",
    });
}
494
+
495
/// Runs speech detection on base64-encoded audio using a previously created VAD context.
///
/// Rejects with `whisper_vad_error` when no context is registered under `contextId`
/// or when `audioDataBase64` cannot be decoded; otherwise resolves with whatever
/// `detectSpeech:options:` returns.
RCT_REMAP_METHOD(vadDetectSpeech,
                 withContextId:(int)contextId
                 withAudioData:(NSString *)audioDataBase64
                 withOptions:(NSDictionary *)options
                 withResolver:(RCTPromiseResolveBlock)resolve
                 withRejecter:(RCTPromiseRejectBlock)reject)
{
    RNWhisperVadContext *detector = vadContexts[@(contextId)];
    if (!detector) {
        reject(@"whisper_vad_error", @"VAD context not found", nil);
        return;
    }

    // The payload arrives from JS as a base64 string; turn it back into raw bytes.
    NSData *decodedAudio = [[NSData alloc] initWithBase64EncodedString:audioDataBase64
                                                               options:0];
    if (!decodedAudio) {
        reject(@"whisper_vad_error", @"Invalid audio data", nil);
        return;
    }

    resolve([detector detectSpeech:decodedAudio options:options]);
}
519
+
520
/// Runs speech detection on a WAV source using a previously created VAD context.
///
/// `filePath` may be an http(s) URL (downloaded first), a
/// `data:audio/wav;base64,` URI, or a local file path.
///
/// Rejects with `whisper_vad_error` when no context is registered under `contextId`
/// or when the audio cannot be loaded/decoded; otherwise resolves with whatever
/// `detectSpeech:options:` returns.
RCT_REMAP_METHOD(vadDetectSpeechFile,
                 withVadContextId:(int)contextId
                 withFilePath:(NSString *)filePath
                 withOptions:(NSDictionary *)options
                 withResolver:(RCTPromiseResolveBlock)resolve
                 withRejecter:(RCTPromiseRejectBlock)reject)
{
    RNWhisperVadContext *vadContext = vadContexts[@(contextId)];

    if (vadContext == nil) {
        reject(@"whisper_vad_error", @"VAD context not found", nil);
        return;
    }

    NSString *const wavDataURIPrefix = @"data:audio/wav;base64,";

    // Handle different input types like transcribeFile does
    float *data = NULL;
    int count = 0;
    if ([filePath hasPrefix:@"http://"] || [filePath hasPrefix:@"https://"]) {
        NSString *path = [RNWhisperDownloader downloadFile:filePath toFile:nil];
        data = [RNWhisperAudioUtils decodeWaveFile:path count:&count];
    } else if ([filePath hasPrefix:wavDataURIPrefix]) {
        // Strip the URI prefix by its actual length rather than a hard-coded index.
        NSString *payload = [filePath substringFromIndex:[wavDataURIPrefix length]];
        NSData *waveData = [[NSData alloc] initWithBase64EncodedString:payload options:0];
        // Invalid base64 yields nil; leave `data` NULL so the error path below runs
        // instead of passing nil to the decoder.
        if (waveData != nil) {
            data = [RNWhisperAudioUtils decodeWaveData:waveData count:&count cutHeader:YES];
        }
    } else {
        data = [RNWhisperAudioUtils decodeWaveFile:filePath count:&count];
    }

    if (data == NULL) {
        reject(@"whisper_vad_error", @"Failed to load or decode audio file", nil);
        return;
    }

    // Convert float32 data to NSData for VAD context. `dataWithBytes:length:` copies
    // the samples, so the decoded buffer is no longer needed afterwards.
    NSData *audioData = [NSData dataWithBytes:data length:count * sizeof(float)];

    // NOTE(review): assumes the decode helpers return a malloc'd buffer owned by the
    // caller - confirm against RNWhisperAudioUtils. Without this the buffer leaks on
    // every call.
    free(data);

    NSArray *segments = [vadContext detectSpeech:audioData options:options];
    resolve(segments);
}
558
+
559
/// Invalidates the VAD context registered under `contextId` and removes it from
/// `vadContexts`. Rejects with `whisper_vad_error` when no such context exists.
RCT_REMAP_METHOD(releaseVadContext,
                 withVadContextId:(int)contextId
                 withResolver:(RCTPromiseResolveBlock)resolve
                 withRejecter:(RCTPromiseRejectBlock)reject)
{
    NSNumber *key = @(contextId);
    RNWhisperVadContext *target = vadContexts[key];

    if (target != nil) {
        [target invalidate];
        [vadContexts removeObjectForKey:key];
        resolve(nil);
        return;
    }

    reject(@"whisper_vad_error", @"VAD context not found", nil);
}
573
+
574
/// Invalidates every registered VAD context, empties `vadContexts`, and resolves
/// with nil. Always resolves, including when no context was ever created.
RCT_EXPORT_METHOD(releaseAllVadContexts:(RCTPromiseResolveBlock)resolve
                  withRejecter:(RCTPromiseRejectBlock)reject)
{
    // When `vadContexts` is nil, `allValues` is nil and both the loop and the
    // removal below are no-ops, so no explicit nil check is needed.
    for (RNWhisperVadContext *registered in [vadContexts allValues]) {
        [registered invalidate];
    }
    [vadContexts removeAllObjects];

    resolve(nil);
}
586
+
440
587
  #ifdef RCT_NEW_ARCH_ENABLED
441
588
  - (std::shared_ptr<facebook::react::TurboModule>)getTurboModule:
442
589
  (const facebook::react::ObjCTurboModule::InitParams &)params
@@ -1,5 +1,9 @@
1
1
  #import "RNWhisperAudioUtils.h"
2
+ #if RNWHISPER_BUILD_FROM_SOURCE
2
3
  #import "whisper.h"
4
+ #else
5
+ #import <rnwhisper/whisper.h>
6
+ #endif
3
7
 
4
8
  @implementation RNWhisperAudioUtils
5
9
 
@@ -1,6 +1,11 @@
1
1
  #ifdef __cplusplus
2
+ #if RNWHISPER_BUILD_FROM_SOURCE
2
3
  #import "whisper.h"
3
4
  #import "rn-whisper.h"
5
+ #else
6
+ #import <rnwhisper/whisper.h>
7
+ #import <rnwhisper/rn-whisper.h>
8
+ #endif
4
9
  #endif
5
10
 
6
11
  #import <AVFoundation/AVFoundation.h>
@@ -19,8 +19,9 @@
19
19
  cparams.use_gpu = !noMetal;
20
20
  cparams.flash_attn = useFlashAttn;
21
21
 
22
- // TODO: Figure out why it leads to re-init crash
22
+ // TODO: Expose dtw_token_timestamps and dtw_aheads_preset
23
23
  cparams.dtw_token_timestamps = false;
24
+ // cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE;
24
25
 
25
26
  cparams.use_coreml = !noCoreML;
26
27
  #ifndef WHISPER_USE_COREML
@@ -35,36 +36,30 @@
35
36
  NSLog(@"[RNWhisper] ggml-metal is not enabled in this build, ignoring use_gpu option");
36
37
  cparams.use_gpu = false;
37
38
  }
39
+ reasonNoMetal = @"Metal is not enabled in this build";
38
40
  #endif
39
41
 
40
42
  #ifdef WSP_GGML_USE_METAL
41
43
  if (cparams.use_gpu) {
42
- #if TARGET_OS_SIMULATOR
43
- NSLog(@"[RNWhisper] ggml-metal is not available in simulator, ignoring use_gpu option: %@", reasonNoMetal);
44
- cparams.use_gpu = false;
45
- #else // TARGET_OS_SIMULATOR
46
- // Check ggml-metal availability
47
- NSError * error = nil;
48
44
  id<MTLDevice> device = MTLCreateSystemDefaultDevice();
49
- id<MTLLibrary> library = [device
50
- newLibraryWithSource:@"#include <metal_stdlib>\n"
51
- "using namespace metal;"
52
- "kernel void test() { simd_sum(0); }"
53
- options:nil
54
- error:&error
55
- ];
56
- if (error) {
57
- reasonNoMetal = [error localizedDescription];
58
- } else {
59
- id<MTLFunction> kernel = [library newFunctionWithName:@"test"];
60
- id<MTLComputePipelineState> pipeline = [device newComputePipelineStateWithFunction:kernel error:&error];
61
- if (pipeline == nil) {
62
- reasonNoMetal = [error localizedDescription];
63
- NSLog(@"[RNWhisper] ggml-metal is not available, ignoring use_gpu option: %@", reasonNoMetal);
64
- cparams.use_gpu = false;
65
- }
45
+
46
+ // Check ggml-metal availability
47
+ BOOL supportsGgmlMetal = [device supportsFamily:MTLGPUFamilyApple7];
48
+ if (@available(iOS 16.0, tvOS 16.0, *)) {
49
+ supportsGgmlMetal = supportsGgmlMetal && [device supportsFamily:MTLGPUFamilyMetal3];
66
50
  }
67
- #endif // TARGET_OS_SIMULATOR
51
+ if (!supportsGgmlMetal) {
52
+ cparams.use_gpu = false;
53
+ reasonNoMetal = @"Metal is not supported in this device";
54
+ }
55
+
56
+ #if TARGET_OS_SIMULATOR
57
+ // Use the backend, but no layers because not supported fully on simulator
58
+ cparams.use_gpu = false;
59
+ reasonNoMetal = @"Metal is not supported in simulator";
60
+ #endif
61
+
62
+ device = nil;
68
63
  }
69
64
  #endif // WSP_GGML_USE_METAL
70
65
 
@@ -431,6 +426,7 @@ struct rnwhisper_segments_callback_data {
431
426
  self->recordState.job = job;
432
427
  int code = [self fullTranscribe:job audioData:audioData audioDataCount:audioDataCount];
433
428
  rnwhisper::job_remove(jobId);
429
+ self->recordState.job = nullptr;
434
430
  self->recordState.isTranscribing = false;
435
431
  onEnd(code);
436
432
  });
@@ -445,7 +441,7 @@ struct rnwhisper_segments_callback_data {
445
441
  }
446
442
 
447
443
  - (void)stopTranscribe:(int)jobId {
448
- if (self->recordState.job) self->recordState.job->abort();
444
+ if (self->recordState.job != nullptr) self->recordState.job->abort();
449
445
  if (self->recordState.isRealtime && self->recordState.isCapturing) {
450
446
  [self stopAudio];
451
447
  if (!self->recordState.isTranscribing) {
@@ -0,0 +1,29 @@
1
+ #ifdef __cplusplus
2
+ #if RNWHISPER_BUILD_FROM_SOURCE
3
+ #import "whisper.h"
4
+ #import "rn-whisper.h"
5
+ #else
6
+ #import <rnwhisper/whisper.h>
7
+ #import <rnwhisper/rn-whisper.h>
8
+ #endif
9
+ #endif
10
+
11
+ #import <Foundation/Foundation.h>
12
+
13
/// Objective-C wrapper that holds a native `struct whisper_vad_context *`
/// together with its context id, a dispatch queue, and Metal status fields.
@interface RNWhisperVadContext : NSObject {
    int contextId;                      // presumably the id passed to the factory below; confirm in the .mm
    dispatch_queue_t dQueue;            // presumably what -getDispatchQueue returns; confirm in the .mm
    struct whisper_vad_context *vctx;   // native VAD handle; presumably what -getVadContext returns
    NSString *reasonNoMetal;            // presumably set when Metal is not used; confirm in the .mm
    bool isMetalEnabled;
}

/// Class-level factory taking the model path, a context id, a Metal opt-out flag,
/// and a thread count.
/// NOTE(review): declared as a class method (`+`) despite the `init` prefix;
/// callers send it to the class, not to an allocated instance.
+ (instancetype)initWithModelPath:(NSString *)modelPath
                        contextId:(int)contextId
                          noMetal:(BOOL)noMetal
                         nThreads:(NSNumber *)nThreads;

- (bool)isMetalEnabled;
- (NSString *)reasonNoMetal;
- (struct whisper_vad_context *)getVadContext;
- (dispatch_queue_t)getDispatchQueue;

/// Runs speech detection over `audioData` with the given `options` and returns
/// the detected segments as an array.
/// NOTE(review): the visible caller passes raw float32 samples packed into NSData;
/// confirm the expected sample format against the implementation.
- (NSArray *)detectSpeech:(NSData *)audioData options:(NSDictionary *)options;

/// Called by the module before it drops the context from its registry.
/// Presumably frees the native context; confirm in the .mm.
- (void)invalidate;

@end