whisper.rn 0.4.0-rc.8 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. package/README.md +5 -1
  2. package/android/build.gradle +12 -3
  3. package/android/src/main/CMakeLists.txt +44 -13
  4. package/android/src/main/java/com/rnwhisper/AudioUtils.java +27 -12
  5. package/android/src/main/java/com/rnwhisper/RNWhisper.java +75 -34
  6. package/android/src/main/java/com/rnwhisper/WhisperContext.java +53 -38
  7. package/android/src/main/jni.cpp +38 -1
  8. package/android/src/main/jniLibs/arm64-v8a/librnwhisper.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnwhisper_v8fp16_va_2.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/librnwhisper.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/librnwhisper_vfpv4.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnwhisper.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnwhisper_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
  15. package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
  16. package/cpp/coreml/whisper-compat.h +10 -0
  17. package/cpp/coreml/whisper-compat.m +35 -0
  18. package/cpp/coreml/whisper-decoder-impl.h +27 -15
  19. package/cpp/coreml/whisper-decoder-impl.m +36 -10
  20. package/cpp/coreml/whisper-encoder-impl.h +21 -9
  21. package/cpp/coreml/whisper-encoder-impl.m +29 -3
  22. package/cpp/ggml-alloc.c +727 -517
  23. package/cpp/ggml-alloc.h +47 -65
  24. package/cpp/ggml-backend-impl.h +196 -57
  25. package/cpp/ggml-backend-reg.cpp +591 -0
  26. package/cpp/ggml-backend.cpp +2016 -0
  27. package/cpp/ggml-backend.h +234 -89
  28. package/cpp/ggml-common.h +1861 -0
  29. package/cpp/ggml-cpp.h +39 -0
  30. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  31. package/cpp/ggml-cpu/amx/amx.h +8 -0
  32. package/cpp/ggml-cpu/amx/common.h +91 -0
  33. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  34. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  35. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  36. package/cpp/ggml-cpu/arch/arm/quants.c +4113 -0
  37. package/cpp/ggml-cpu/arch/arm/repack.cpp +2162 -0
  38. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  39. package/cpp/ggml-cpu/arch/x86/quants.c +4310 -0
  40. package/cpp/ggml-cpu/arch/x86/repack.cpp +3284 -0
  41. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  42. package/cpp/ggml-cpu/binary-ops.cpp +158 -0
  43. package/cpp/ggml-cpu/binary-ops.h +16 -0
  44. package/cpp/ggml-cpu/common.h +72 -0
  45. package/cpp/ggml-cpu/ggml-cpu-impl.h +511 -0
  46. package/cpp/ggml-cpu/ggml-cpu.c +3473 -0
  47. package/cpp/ggml-cpu/ggml-cpu.cpp +671 -0
  48. package/cpp/ggml-cpu/ops.cpp +9085 -0
  49. package/cpp/ggml-cpu/ops.h +111 -0
  50. package/cpp/ggml-cpu/quants.c +1157 -0
  51. package/cpp/ggml-cpu/quants.h +89 -0
  52. package/cpp/ggml-cpu/repack.cpp +1570 -0
  53. package/cpp/ggml-cpu/repack.h +98 -0
  54. package/cpp/ggml-cpu/simd-mappings.h +1006 -0
  55. package/cpp/ggml-cpu/traits.cpp +36 -0
  56. package/cpp/ggml-cpu/traits.h +38 -0
  57. package/cpp/ggml-cpu/unary-ops.cpp +186 -0
  58. package/cpp/ggml-cpu/unary-ops.h +28 -0
  59. package/cpp/ggml-cpu/vec.cpp +321 -0
  60. package/cpp/ggml-cpu/vec.h +973 -0
  61. package/cpp/ggml-cpu.h +143 -0
  62. package/cpp/ggml-impl.h +525 -168
  63. package/cpp/ggml-metal-impl.h +622 -0
  64. package/cpp/ggml-metal.h +16 -14
  65. package/cpp/ggml-metal.m +5289 -1859
  66. package/cpp/ggml-opt.cpp +1037 -0
  67. package/cpp/ggml-opt.h +237 -0
  68. package/cpp/ggml-quants.c +2916 -6877
  69. package/cpp/ggml-quants.h +87 -249
  70. package/cpp/ggml-threading.cpp +12 -0
  71. package/cpp/ggml-threading.h +14 -0
  72. package/cpp/ggml-whisper-sim.metallib +0 -0
  73. package/cpp/ggml-whisper.metallib +0 -0
  74. package/cpp/ggml.c +3293 -16770
  75. package/cpp/ggml.h +778 -835
  76. package/cpp/gguf.cpp +1347 -0
  77. package/cpp/gguf.h +202 -0
  78. package/cpp/rn-whisper.cpp +84 -0
  79. package/cpp/rn-whisper.h +2 -0
  80. package/cpp/whisper-arch.h +197 -0
  81. package/cpp/whisper.cpp +3240 -944
  82. package/cpp/whisper.h +144 -31
  83. package/ios/CMakeLists.txt +95 -0
  84. package/ios/RNWhisper.h +5 -0
  85. package/ios/RNWhisper.mm +124 -37
  86. package/ios/RNWhisperAudioUtils.h +1 -0
  87. package/ios/RNWhisperAudioUtils.m +24 -13
  88. package/ios/RNWhisperContext.h +8 -2
  89. package/ios/RNWhisperContext.mm +42 -8
  90. package/ios/rnwhisper.xcframework/Info.plist +74 -0
  91. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
  92. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
  93. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +354 -0
  94. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-common.h +1861 -0
  95. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
  96. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
  97. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +603 -0
  98. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
  99. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +66 -0
  100. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-opt.h +237 -0
  101. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-quants.h +100 -0
  102. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-threading.h +14 -0
  103. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +2221 -0
  104. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/gguf.h +202 -0
  105. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
  106. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
  107. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper.h +52 -0
  108. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper-arch.h +197 -0
  109. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +739 -0
  110. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  111. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  112. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  113. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
  114. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
  115. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +354 -0
  116. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +1861 -0
  117. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
  118. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
  119. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +603 -0
  120. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
  121. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +66 -0
  122. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +237 -0
  123. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +100 -0
  124. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-threading.h +14 -0
  125. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +2221 -0
  126. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/gguf.h +202 -0
  127. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
  128. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
  129. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +52 -0
  130. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper-arch.h +197 -0
  131. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +739 -0
  132. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  133. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +101 -0
  134. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  135. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  136. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
  137. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
  138. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +354 -0
  139. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-common.h +1861 -0
  140. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
  141. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
  142. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +603 -0
  143. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
  144. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +66 -0
  145. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-opt.h +237 -0
  146. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-quants.h +100 -0
  147. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-threading.h +14 -0
  148. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +2221 -0
  149. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/gguf.h +202 -0
  150. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
  151. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
  152. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper.h +52 -0
  153. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper-arch.h +197 -0
  154. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +739 -0
  155. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  156. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  157. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  158. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
  159. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
  160. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +354 -0
  161. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +1861 -0
  162. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
  163. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
  164. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +603 -0
  165. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
  166. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +66 -0
  167. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +237 -0
  168. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +100 -0
  169. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-threading.h +14 -0
  170. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +2221 -0
  171. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/gguf.h +202 -0
  172. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
  173. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
  174. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +52 -0
  175. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper-arch.h +197 -0
  176. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +739 -0
  177. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  178. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +101 -0
  179. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  180. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  181. package/jest/mock.js +14 -1
  182. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  183. package/lib/commonjs/index.js +48 -19
  184. package/lib/commonjs/index.js.map +1 -1
  185. package/lib/commonjs/version.json +1 -1
  186. package/lib/module/NativeRNWhisper.js.map +1 -1
  187. package/lib/module/index.js +48 -19
  188. package/lib/module/index.js.map +1 -1
  189. package/lib/module/version.json +1 -1
  190. package/lib/typescript/NativeRNWhisper.d.ts +6 -3
  191. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  192. package/lib/typescript/index.d.ts +25 -3
  193. package/lib/typescript/index.d.ts.map +1 -1
  194. package/package.json +15 -10
  195. package/src/NativeRNWhisper.ts +12 -3
  196. package/src/index.ts +63 -24
  197. package/src/version.json +1 -1
  198. package/whisper-rn.podspec +18 -18
  199. package/cpp/README.md +0 -4
  200. package/cpp/ggml-backend.c +0 -1718
  201. package/cpp/ggml-metal-whisper.metal +0 -5820
@@ -0,0 +1,739 @@
1
+ #ifndef WHISPER_H
2
+ #define WHISPER_H
3
+
4
+ #include "ggml.h"
5
+ #include "ggml-cpu.h"
6
+
7
+ #include <stddef.h>
8
+ #include <stdint.h>
9
+ #include <stdbool.h>
10
+
11
+ #ifdef __GNUC__
12
+ # define WHISPER_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
13
+ #elif defined(_MSC_VER)
14
+ # define WHISPER_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
15
+ #else
16
+ # define WHISPER_DEPRECATED(func, hint) func
17
+ #endif
18
+
19
+ #ifdef WHISPER_SHARED
20
+ # ifdef _WIN32
21
+ # ifdef WHISPER_BUILD
22
+ # define WHISPER_API __declspec(dllexport)
23
+ # else
24
+ # define WHISPER_API __declspec(dllimport)
25
+ # endif
26
+ # else
27
+ # define WHISPER_API __attribute__ ((visibility ("default")))
28
+ # endif
29
+ #else
30
+ # define WHISPER_API
31
+ #endif
32
+
33
+ #define WHISPER_SAMPLE_RATE 16000
34
+ #define WHISPER_N_FFT 400
35
+ #define WHISPER_HOP_LENGTH 160
36
+ #define WHISPER_CHUNK_SIZE 30
37
+
38
+ #ifdef __cplusplus
39
+ extern "C" {
40
+ #endif
41
+
42
+ //
43
+ // C interface
44
+ //
45
+ // The following interface is thread-safe as long as the same whisper_context is not used by multiple threads
46
+ // concurrently.
47
+ //
48
+ // Basic usage:
49
+ //
50
+ // #include "whisper.h"
51
+ //
52
+ // ...
53
+ //
54
+ // whisper_context_params cparams = whisper_context_default_params();
55
+ //
56
+ // struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
57
+ //
58
+ // if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
59
+ // fprintf(stderr, "failed to process audio\n");
60
+ // return 7;
61
+ // }
62
+ //
63
+ // const int n_segments = whisper_full_n_segments(ctx);
64
+ // for (int i = 0; i < n_segments; ++i) {
65
+ // const char * text = whisper_full_get_segment_text(ctx, i);
66
+ // printf("%s", text);
67
+ // }
68
+ //
69
+ // whisper_free(ctx);
70
+ //
71
+ // ...
72
+ //
73
+ // This is a demonstration of the most straightforward usage of the library.
74
+ // "pcmf32" contains the RAW audio data in 32-bit floating point format.
75
+ //
76
+ // The interface also allows for more fine-grained control over the computation, but it requires a deeper
77
+ // understanding of how the model works.
78
+ //
79
+
80
+ struct whisper_context;
81
+ struct whisper_state;
82
+ struct whisper_full_params;
83
+
84
+ typedef int32_t whisper_pos;
85
+ typedef int32_t whisper_token;
86
+ typedef int32_t whisper_seq_id;
87
+
88
+ enum whisper_alignment_heads_preset {
89
+ WHISPER_AHEADS_NONE,
90
+ WHISPER_AHEADS_N_TOP_MOST, // All heads from the N-top-most text-layers
91
+ WHISPER_AHEADS_CUSTOM,
92
+ WHISPER_AHEADS_TINY_EN,
93
+ WHISPER_AHEADS_TINY,
94
+ WHISPER_AHEADS_BASE_EN,
95
+ WHISPER_AHEADS_BASE,
96
+ WHISPER_AHEADS_SMALL_EN,
97
+ WHISPER_AHEADS_SMALL,
98
+ WHISPER_AHEADS_MEDIUM_EN,
99
+ WHISPER_AHEADS_MEDIUM,
100
+ WHISPER_AHEADS_LARGE_V1,
101
+ WHISPER_AHEADS_LARGE_V2,
102
+ WHISPER_AHEADS_LARGE_V3,
103
+ WHISPER_AHEADS_LARGE_V3_TURBO,
104
+ };
105
+
106
+ typedef struct whisper_ahead {
107
+ int n_text_layer;
108
+ int n_head;
109
+ } whisper_ahead;
110
+
111
+ typedef struct whisper_aheads {
112
+ size_t n_heads;
113
+ const whisper_ahead * heads;
114
+ } whisper_aheads;
115
+
116
+ struct whisper_context_params {
117
+ bool use_gpu;
118
+ bool use_coreml;
119
+ bool flash_attn;
120
+ int gpu_device; // CUDA device
121
+
122
+ // [EXPERIMENTAL] Token-level timestamps with DTW
123
+ bool dtw_token_timestamps;
124
+ enum whisper_alignment_heads_preset dtw_aheads_preset;
125
+
126
+ int dtw_n_top;
127
+ struct whisper_aheads dtw_aheads;
128
+
129
+ size_t dtw_mem_size; // TODO: remove
130
+ };
131
+
132
+ typedef struct whisper_token_data {
133
+ whisper_token id; // token id
134
+ whisper_token tid; // forced timestamp token id
135
+
136
+ float p; // probability of the token
137
+ float plog; // log probability of the token
138
+ float pt; // probability of the timestamp token
139
+ float ptsum; // sum of probabilities of all timestamp tokens
140
+
141
+ // token-level timestamp data
142
+ // do not use if you haven't computed token-level timestamps
143
+ int64_t t0; // start time of the token
144
+ int64_t t1; // end time of the token
145
+
146
+ // [EXPERIMENTAL] Token-level timestamps with DTW
147
+ // do not use if you haven't computed token-level timestamps with dtw
148
+ // Roughly corresponds to the moment in audio in which the token was output
149
+ int64_t t_dtw;
150
+
151
+ float vlen; // voice length of the token
152
+ } whisper_token_data;
153
+
154
+ typedef struct whisper_model_loader {
155
+ void * context;
156
+
157
+ size_t (*read)(void * ctx, void * output, size_t read_size);
158
+ bool (*eof)(void * ctx);
159
+ void (*close)(void * ctx);
160
+ } whisper_model_loader;
161
+
162
+ // grammar element type
163
+ enum whisper_gretype {
164
+ // end of rule definition
165
+ WHISPER_GRETYPE_END = 0,
166
+
167
+ // start of alternate definition for rule
168
+ WHISPER_GRETYPE_ALT = 1,
169
+
170
+ // non-terminal element: reference to rule
171
+ WHISPER_GRETYPE_RULE_REF = 2,
172
+
173
+ // terminal element: character (code point)
174
+ WHISPER_GRETYPE_CHAR = 3,
175
+
176
+ // inverse char(s) ([^a], [^a-b], [^abc])
177
+ WHISPER_GRETYPE_CHAR_NOT = 4,
178
+
179
+ // modifies a preceding WHISPER_GRETYPE_CHAR or WHISPER_GRETYPE_CHAR_ALT to
180
+ // be an inclusive range ([a-z])
181
+ WHISPER_GRETYPE_CHAR_RNG_UPPER = 5,
182
+
183
+ // modifies a preceding WHISPER_GRETYPE_CHAR or
184
+ // WHISPER_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
185
+ WHISPER_GRETYPE_CHAR_ALT = 6,
186
+ };
187
+
188
+ typedef struct whisper_grammar_element {
189
+ enum whisper_gretype type;
190
+ uint32_t value; // Unicode code point or rule ID
191
+ } whisper_grammar_element;
192
+
193
+ typedef struct whisper_vad_params {
194
+ float threshold; // Probability threshold to consider as speech.
195
+ int min_speech_duration_ms; // Min duration for a valid speech segment.
196
+ int min_silence_duration_ms; // Min silence duration to consider speech as ended.
197
+ float max_speech_duration_s; // Max duration of a speech segment before forcing a new segment.
198
+ int speech_pad_ms; // Padding added before and after speech segments.
199
+ float samples_overlap; // Overlap in seconds when copying audio samples from speech segment.
200
+ } whisper_vad_params;
201
+
202
+ // Various functions for loading a ggml whisper model.
203
+ // Allocate (almost) all memory needed for the model.
204
+ // Return NULL on failure
205
+ WHISPER_API struct whisper_context * whisper_init_from_file_with_params (const char * path_model, struct whisper_context_params params);
206
+ WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params);
207
+ WHISPER_API struct whisper_context * whisper_init_with_params (struct whisper_model_loader * loader, struct whisper_context_params params);
208
+
209
+ // These are the same as the above, but the internal state of the context is not allocated automatically
210
+ // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
211
+ WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state (const char * path_model, struct whisper_context_params params);
212
+ WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params);
213
+ WHISPER_API struct whisper_context * whisper_init_with_params_no_state (struct whisper_model_loader * loader, struct whisper_context_params params);
214
+
215
+ WHISPER_DEPRECATED(
216
+ WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model),
217
+ "use whisper_init_from_file_with_params instead"
218
+ );
219
+ WHISPER_DEPRECATED(
220
+ WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size),
221
+ "use whisper_init_from_buffer_with_params instead"
222
+ );
223
+ WHISPER_DEPRECATED(
224
+ WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader),
225
+ "use whisper_init_with_params instead"
226
+ );
227
+ WHISPER_DEPRECATED(
228
+ WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model),
229
+ "use whisper_init_from_file_with_params_no_state instead"
230
+ );
231
+ WHISPER_DEPRECATED(
232
+ WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size),
233
+ "use whisper_init_from_buffer_with_params_no_state instead"
234
+ );
235
+ WHISPER_DEPRECATED(
236
+ WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader),
237
+ "use whisper_init_with_params_no_state instead"
238
+ );
239
+
240
+ WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
241
+
242
+ // Given a context, enable use of OpenVINO for encode inference.
243
+ // model_path: Optional path to OpenVINO encoder IR model. If set to nullptr,
244
+ // the path will be generated from the ggml model path that was passed
245
+ // in to whisper_init_from_file. For example, if 'path_model' was
246
+ // "/path/to/ggml-base.en.bin", then OpenVINO IR model path will be
247
+ // assumed to be "/path/to/ggml-base.en-encoder-openvino.xml".
248
+ // device: OpenVINO device to run inference on ("CPU", "GPU", etc.)
249
+ // cache_dir: Optional cache directory that can speed up init time, especially for
250
+ // GPU, by caching compiled 'blobs' there.
251
+ // Set to nullptr if not used.
252
+ // Returns 0 on success. If OpenVINO is not enabled in build, this simply returns 1.
253
+ WHISPER_API int whisper_ctx_init_openvino_encoder_with_state(
254
+ struct whisper_context * ctx,
255
+ struct whisper_state * state,
256
+ const char * model_path,
257
+ const char * device,
258
+ const char * cache_dir);
259
+
260
+ WHISPER_API int whisper_ctx_init_openvino_encoder(
261
+ struct whisper_context * ctx,
262
+ const char * model_path,
263
+ const char * device,
264
+ const char * cache_dir);
265
+
266
+ // Frees all allocated memory
267
+ WHISPER_API void whisper_free (struct whisper_context * ctx);
268
+ WHISPER_API void whisper_free_state(struct whisper_state * state);
269
+ WHISPER_API void whisper_free_params(struct whisper_full_params * params);
270
+ WHISPER_API void whisper_free_context_params(struct whisper_context_params * params);
271
+
272
+ // Convert RAW PCM audio to log mel spectrogram.
273
+ // The resulting spectrogram is stored inside the default state of the provided whisper context.
274
+ // Returns 0 on success
275
+ WHISPER_API int whisper_pcm_to_mel(
276
+ struct whisper_context * ctx,
277
+ const float * samples,
278
+ int n_samples,
279
+ int n_threads);
280
+
281
+ WHISPER_API int whisper_pcm_to_mel_with_state(
282
+ struct whisper_context * ctx,
283
+ struct whisper_state * state,
284
+ const float * samples,
285
+ int n_samples,
286
+ int n_threads);
287
+
288
+ // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
289
+ // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
290
+ // n_mel must be 80
291
+ // Returns 0 on success
292
+ WHISPER_API int whisper_set_mel(
293
+ struct whisper_context * ctx,
294
+ const float * data,
295
+ int n_len,
296
+ int n_mel);
297
+
298
+ WHISPER_API int whisper_set_mel_with_state(
299
+ struct whisper_context * ctx,
300
+ struct whisper_state * state,
301
+ const float * data,
302
+ int n_len,
303
+ int n_mel);
304
+
305
+ // Run the Whisper encoder on the log mel spectrogram stored inside the default state in the provided whisper context.
306
+ // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
307
+ // offset can be used to specify the offset of the first frame in the spectrogram.
308
+ // Returns 0 on success
309
+ WHISPER_API int whisper_encode(
310
+ struct whisper_context * ctx,
311
+ int offset,
312
+ int n_threads);
313
+
314
+ WHISPER_API int whisper_encode_with_state(
315
+ struct whisper_context * ctx,
316
+ struct whisper_state * state,
317
+ int offset,
318
+ int n_threads);
319
+
320
+ // Run the Whisper decoder to obtain the logits and probabilities for the next token.
321
+ // Make sure to call whisper_encode() first.
322
+ // tokens + n_tokens is the provided context for the decoder.
323
+ // n_past is the number of tokens to use from previous decoder calls.
324
+ // Returns 0 on success
325
+ // TODO: add support for multiple decoders
326
+ WHISPER_API int whisper_decode(
327
+ struct whisper_context * ctx,
328
+ const whisper_token * tokens,
329
+ int n_tokens,
330
+ int n_past,
331
+ int n_threads);
332
+
333
+ WHISPER_API int whisper_decode_with_state(
334
+ struct whisper_context * ctx,
335
+ struct whisper_state * state,
336
+ const whisper_token * tokens,
337
+ int n_tokens,
338
+ int n_past,
339
+ int n_threads);
340
+
341
+ // Convert the provided text into tokens.
342
+ // The tokens pointer must be large enough to hold the resulting tokens.
343
+ // Returns the number of tokens on success, no more than n_max_tokens
344
+ // Returns a negative number on failure - the number of tokens that would have been returned
345
+ // TODO: not sure if correct
346
+ WHISPER_API int whisper_tokenize(
347
+ struct whisper_context * ctx,
348
+ const char * text,
349
+ whisper_token * tokens,
350
+ int n_max_tokens);
351
+
352
+ // Return the number of tokens in the provided text
353
+ // Equivalent to: -whisper_tokenize(ctx, text, NULL, 0)
354
+ WHISPER_API int whisper_token_count(struct whisper_context * ctx, const char * text);
355
+
356
+ // Largest language id (i.e. number of available languages - 1)
357
+ WHISPER_API int whisper_lang_max_id(void);
358
+
359
+ // Return the id of the specified language, returns -1 if not found
360
+ // Examples:
361
+ // "de" -> 2
362
+ // "german" -> 2
363
+ WHISPER_API int whisper_lang_id(const char * lang);
364
+
365
+ // Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
366
+ WHISPER_API const char * whisper_lang_str(int id);
367
+
368
+ // Return the full name of the specified language id (e.g. 2 -> "german"), returns nullptr if not found
369
+ WHISPER_API const char * whisper_lang_str_full(int id);
370
+
371
+ // Use mel data at offset_ms to try and auto-detect the spoken language
372
+ // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
373
+ // Returns the top language id or negative on failure
374
+ // If not null, fills the lang_probs array with the probabilities of all languages
375
+ // The array must be whisper_lang_max_id() + 1 in size
376
+ // ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69
377
+ WHISPER_API int whisper_lang_auto_detect(
378
+ struct whisper_context * ctx,
379
+ int offset_ms,
380
+ int n_threads,
381
+ float * lang_probs);
382
+
383
+ WHISPER_API int whisper_lang_auto_detect_with_state(
384
+ struct whisper_context * ctx,
385
+ struct whisper_state * state,
386
+ int offset_ms,
387
+ int n_threads,
388
+ float * lang_probs);
389
+
390
+ WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length
391
+ WHISPER_API int whisper_n_len_from_state(struct whisper_state * state); // mel length
392
+ WHISPER_API int whisper_n_vocab (struct whisper_context * ctx);
393
+ WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx);
394
+ WHISPER_API int whisper_n_audio_ctx (struct whisper_context * ctx);
395
+ WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);
396
+
397
+ WHISPER_API int whisper_model_n_vocab (struct whisper_context * ctx); // NOTE(review): whisper_n_vocab() also exists; the difference between the two is not documented here
398
+ WHISPER_API int whisper_model_n_audio_ctx (struct whisper_context * ctx);
399
+ WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
400
+ WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
401
+ WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
402
+ WHISPER_API int whisper_model_n_text_ctx (struct whisper_context * ctx);
403
+ WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
404
+ WHISPER_API int whisper_model_n_text_head (struct whisper_context * ctx);
405
+ WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
406
+ WHISPER_API int whisper_model_n_mels (struct whisper_context * ctx);
407
+ WHISPER_API int whisper_model_ftype (struct whisper_context * ctx); // integer code; its meaning is not documented in this header section
408
+ WHISPER_API int whisper_model_type (struct whisper_context * ctx); // integer code; whisper_model_type_readable() returns a string (presumably for this value - confirm)
409
+
410
+ // Token logits obtained from the last call to whisper_decode()
411
+ // The logits for the last token are stored in the last row
412
+ // Rows: n_tokens
413
+ // Cols: n_vocab
414
+ WHISPER_API float * whisper_get_logits (struct whisper_context * ctx); // NOTE(review): ownership/lifetime of the returned buffer is not stated - presumably owned by the context; confirm
415
+ WHISPER_API float * whisper_get_logits_from_state(struct whisper_state * state); // same accessor, taking a whisper_state instead of a context
416
+
417
+ // Token Id -> String. Uses the vocabulary in the provided context
418
+ WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token); // NOTE(review): lifetime of the returned string is not stated - confirm before caching it
419
+ WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx); // takes no token: not a token-to-string helper despite the heading above
420
+
421
+
422
+ // Special tokens
423
+ WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx); // NOTE(review): presumably the <EOT> token - confirm
424
+ WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx); // NOTE(review): presumably the <SOT> token - confirm
425
+ WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
426
+ WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
427
+ WHISPER_API whisper_token whisper_token_nosp(struct whisper_context * ctx);
428
+ WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
429
+ WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx); // NOTE(review): presumably the <BEG> token - confirm
430
+ WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id); // only getter in this group that takes a second argument; valid lang_id range is not stated here
431
+
432
+ // Task tokens
433
+ WHISPER_API whisper_token whisper_token_translate (struct whisper_context * ctx); // NOTE(review): presumably the token selecting the translate task - confirm
434
+ WHISPER_API whisper_token whisper_token_transcribe(struct whisper_context * ctx); // NOTE(review): presumably the token selecting the transcribe task - confirm
435
+
436
+ // Performance information from the default state.
437
+ struct whisper_timings { // returned (by pointer) from whisper_get_timings(); all fields are floats
438
+ float sample_ms; // NOTE(review): the _ms suffix suggests milliseconds; presumably sampling time - confirm
439
+ float encode_ms; // NOTE(review): presumably encoder time - confirm
440
+ float decode_ms; // NOTE(review): presumably decoder time - confirm
441
+ float batchd_ms; // NOTE(review): presumably batched-decode time - confirm
442
+ float prompt_ms; // NOTE(review): presumably prompt-processing time - confirm
443
+ };
444
+ WHISPER_API struct whisper_timings * whisper_get_timings(struct whisper_context * ctx); // NOTE(review): ownership of the returned pointer is not stated - check whether the caller must free it
445
+ WHISPER_API void whisper_print_timings(struct whisper_context * ctx); // NOTE(review): output destination is not stated here
446
+ WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
447
+
448
+ // Print system information
449
+ WHISPER_API const char * whisper_print_system_info(void); // despite "print" in the name, the signature returns a string; NOTE(review): lifetime of that string is not stated
450
+
451
+ ////////////////////////////////////////////////////////////////////////////
452
+
453
+ // Available sampling strategies
454
+ enum whisper_sampling_strategy { // type of whisper_full_params.strategy and argument of whisper_full_default_params()
455
+ WHISPER_SAMPLING_GREEDY, // similar to OpenAI's GreedyDecoder
456
+ WHISPER_SAMPLING_BEAM_SEARCH, // similar to OpenAI's BeamSearchDecoder
457
+ };
458
+
459
+ // Text segment callback
460
+ // Called on every newly generated text segment
461
+ // Use the whisper_full_...() functions to obtain the text segments
462
+ typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data); // n_new: NOTE(review) presumably the number of newly generated segments - confirm; user_data: see whisper_full_params.new_segment_callback_user_data
463
+
464
+ // Progress callback
465
+ typedef void (*whisper_progress_callback)(struct whisper_context * ctx, struct whisper_state * state, int progress, void * user_data); // progress: NOTE(review) scale is not stated here (presumably a percentage) - confirm
466
+
467
+ // Encoder begin callback
468
+ // If not NULL, called before the encoder starts
469
+ // If it returns false, the computation is aborted
470
+ typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data); // return false to abort the computation (see comment above)
471
+
472
+ // Logits filter callback
473
+ // Can be used to modify the logits before sampling
474
+ // If not NULL, called after applying temperature to logits
475
+ typedef void (*whisper_logits_filter_callback)(
476
+ struct whisper_context * ctx,
477
+ struct whisper_state * state,
478
+ const whisper_token_data * tokens, // read-only token data; NOTE(review): presumably the tokens decoded so far - confirm
479
+ int n_tokens, // NOTE(review): presumably the number of entries in tokens - confirm
480
+ float * logits, // writable: the callback may modify the logits before sampling
481
+ void * user_data); // see whisper_full_params.logits_filter_callback_user_data
482
+
483
+ // Parameters for the whisper_full() function
484
+ // If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
485
+ // whisper_full_default_params()
486
+ struct whisper_full_params { // passed by value to whisper_full(), whisper_full_with_state() and whisper_full_parallel()
486
+ enum whisper_sampling_strategy strategy; // WHISPER_SAMPLING_GREEDY or WHISPER_SAMPLING_BEAM_SEARCH
487
+
488
+ int n_threads; // NOTE(review): presumably the number of threads used for processing - confirm
489
+ int n_max_text_ctx; // max tokens to use from past text as prompt for the decoder
490
+ int offset_ms; // start offset in ms
491
+ int duration_ms; // audio duration to process in ms
492
+
493
+ bool translate; // NOTE(review): presumably selects translation instead of transcription - confirm
494
+ bool no_context; // do not use past transcription (if any) as initial prompt for the decoder
495
+ bool no_timestamps; // do not generate timestamps
496
+ bool single_segment; // force single segment output (useful for streaming)
497
+ bool print_special; // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
498
+ bool print_progress; // print progress information
499
+ bool print_realtime; // print results from within whisper.cpp (avoid it, use callback instead)
500
+ bool print_timestamps; // print timestamps for each text segment when printing realtime
501
+
502
+ // [EXPERIMENTAL] token-level timestamps
503
+ bool token_timestamps; // enable token-level timestamps
504
+ float thold_pt; // timestamp token probability threshold (~0.01)
505
+ float thold_ptsum; // timestamp token sum probability threshold (~0.01)
506
+ int max_len; // max segment length in characters
507
+ bool split_on_word; // split on word rather than on token (when used with max_len)
508
+ int max_tokens; // max tokens per segment (0 = no limit)
509
+
510
+ // [EXPERIMENTAL] speed-up techniques
511
+ // note: these can significantly reduce the quality of the output
512
+ bool debug_mode; // debug mode: provides extra info (e.g. dumps log_mel); NOTE(review): listed under "speed-up techniques" but described as a diagnostic
513
+ int audio_ctx; // overwrite the audio context size (0 = use default)
514
+
515
+ // [EXPERIMENTAL] [TDRZ] tinydiarize
516
+ bool tdrz_enable; // enable tinydiarize speaker turn detection
517
+
518
+ // A regular expression that matches tokens to suppress
519
+ const char * suppress_regex;
520
+
521
+ // tokens to provide to the whisper decoder as initial prompt
522
+ // these are prepended to any existing text context from a previous call
523
+ // use whisper_tokenize() to convert text to tokens
524
+ // maximum of whisper_n_text_ctx()/2 tokens are used (typically 224)
525
+ const char * initial_prompt; // prompt given as a C string; NOTE(review): interaction with prompt_tokens is not stated here
526
+ const whisper_token * prompt_tokens; // prompt given as tokens
527
+ int prompt_n_tokens; // NOTE(review): presumably the number of entries in prompt_tokens - confirm
528
+
529
+ // for auto-detection, set to nullptr, "" or "auto"
530
+ const char * language;
531
+ bool detect_language; // NOTE(review): exact effect is not documented here - confirm against whisper_full_default_params()
532
+
533
+ // common decoding parameters:
534
+ bool suppress_blank; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
535
+ bool suppress_nst; // non-speech tokens, ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
536
+
537
+ float temperature; // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478
538
+ float max_initial_ts; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
539
+ float length_penalty; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L267
540
+
541
+ // fallback parameters
542
+ // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L274-L278
543
+ float temperature_inc; // NOTE(review): presumably the temperature step applied on each fallback - confirm
544
+ float entropy_thold; // similar to OpenAI's "compression_ratio_threshold"
545
+ float logprob_thold; // NOTE(review): presumably analogous to OpenAI's logprob threshold - confirm
546
+ float no_speech_thold; // NOTE(review): presumably compared with the per-segment no_speech probability - confirm
547
+
548
+ struct { // options specific to WHISPER_SAMPLING_GREEDY (per the member name)
549
+ int best_of; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
550
+ } greedy;
551
+
552
+ struct { // options specific to WHISPER_SAMPLING_BEAM_SEARCH (per the member name)
553
+ int beam_size; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265
554
+
555
+ float patience; // TODO: not implemented, ref: https://arxiv.org/pdf/2204.05424.pdf
556
+ } beam_search;
557
+
558
+ // called for every newly generated text segment
559
+ whisper_new_segment_callback new_segment_callback;
560
+ void * new_segment_callback_user_data; // opaque pointer; NOTE(review): presumably forwarded as the callback's user_data argument
561
+
562
+ // called on each progress update
563
+ whisper_progress_callback progress_callback;
564
+ void * progress_callback_user_data;
565
+
566
+ // called each time before the encoder starts
567
+ whisper_encoder_begin_callback encoder_begin_callback;
568
+ void * encoder_begin_callback_user_data;
569
+
570
+ // called each time before ggml computation starts
571
+ wsp_ggml_abort_callback abort_callback; // callback type comes from ggml (wsp_ggml_ prefix), not from this header section
572
+ void * abort_callback_user_data;
573
+
574
+ // called by each decoder to filter obtained logits
575
+ whisper_logits_filter_callback logits_filter_callback;
576
+ void * logits_filter_callback_user_data;
577
+
578
+ const whisper_grammar_element ** grammar_rules; // NOTE(review): grammar inputs are undocumented here; presumably an array of rule pointers
579
+ size_t n_grammar_rules; // NOTE(review): presumably the number of entries in grammar_rules - confirm
580
+ size_t i_start_rule; // NOTE(review): presumably an index into grammar_rules - confirm
581
+ float grammar_penalty;
582
+
583
+ // Voice Activity Detection (VAD) params
584
+ bool vad; // Enable VAD
585
+ const char * vad_model_path; // Path to VAD model
586
+
587
+ whisper_vad_params vad_params; // embedded by value; whisper_vad_default_params() returns a struct of this type
588
+ };
590
+
591
+ // NOTE: the *_by_ref functions allocate memory, and it is the responsibility of the caller to free the returned pointer - see whisper_free_context_params() & whisper_free_params()
592
+ WHISPER_API struct whisper_context_params * whisper_context_default_params_by_ref(void); // returns a pointer the caller must free (see note above)
593
+ WHISPER_API struct whisper_context_params whisper_context_default_params (void); // returns the struct by value
594
+
595
+ WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy); // returns a pointer the caller must free (see note above)
596
+ WHISPER_API struct whisper_full_params whisper_full_default_params (enum whisper_sampling_strategy strategy); // returns the struct by value
597
+
598
+ // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
599
+ // Not thread safe for same context
600
+ // Uses the specified decoding strategy to obtain the text.
601
+ WHISPER_API int whisper_full( // NOTE(review): the meaning of the int return value is not documented here - confirm
602
+ struct whisper_context * ctx,
603
+ struct whisper_full_params params, // passed by value
604
+ const float * samples, // input PCM audio, read-only
605
+ int n_samples); // NOTE(review): presumably the number of floats in samples - confirm
606
+ // Variant with the same parameters plus an explicit whisper_state argument
607
+ WHISPER_API int whisper_full_with_state(
608
+ struct whisper_context * ctx,
609
+ struct whisper_state * state,
610
+ struct whisper_full_params params,
611
+ const float * samples,
612
+ int n_samples);
613
+
614
+ // Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
615
+ // Result is stored in the default state of the context
616
+ // Not thread safe if executed in parallel on the same context.
617
+ // It seems this approach can offer some speedup in some cases.
618
+ // However, the transcription accuracy can be worse at the beginning and end of each chunk.
619
+ WHISPER_API int whisper_full_parallel( // NOTE(review): the meaning of the int return value is not documented here - confirm
620
+ struct whisper_context * ctx,
621
+ struct whisper_full_params params,
622
+ const float * samples,
623
+ int n_samples,
624
+ int n_processors); // NOTE(review): presumably the number of chunks processed in parallel - confirm
625
+
626
+ // Number of generated text segments
627
+ // A segment can be a few words, a sentence, or even a paragraph.
628
+ WHISPER_API int whisper_full_n_segments (struct whisper_context * ctx); // NOTE(review): presumably reads the context's default state, like whisper_full_lang_id() - confirm
629
+ WHISPER_API int whisper_full_n_segments_from_state(struct whisper_state * state); // same accessor for an explicit state
630
+
631
+ // Language id associated with the context's default state
632
+ WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);
633
+
634
+ // Language id associated with the provided state
635
+ WHISPER_API int whisper_full_lang_id_from_state(struct whisper_state * state);
636
+
637
+ // Get the start and end time of the specified segment
638
+ WHISPER_API int64_t whisper_full_get_segment_t0 (struct whisper_context * ctx, int i_segment); // start time; NOTE(review): the time unit is not stated in this header section - confirm
639
+ WHISPER_API int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment);
640
+
641
+ WHISPER_API int64_t whisper_full_get_segment_t1 (struct whisper_context * ctx, int i_segment); // end time, same (unstated) unit as t0
642
+ WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);
643
+
644
+ // Get whether the next segment is predicted as a speaker turn
645
+ WHISPER_API bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment); // NOTE(review): presumably only meaningful when whisper_full_params.tdrz_enable is set - confirm
646
+ WHISPER_API bool whisper_full_get_segment_speaker_turn_next_from_state(struct whisper_state * state, int i_segment);
647
+
648
+ // Get the text of the specified segment
649
+ WHISPER_API const char * whisper_full_get_segment_text (struct whisper_context * ctx, int i_segment); // NOTE(review): lifetime of the returned string is not stated - copy it if it must outlive the next call
650
+ WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);
651
+
652
+ // Get number of tokens in the specified segment
653
+ WHISPER_API int whisper_full_n_tokens (struct whisper_context * ctx, int i_segment);
654
+ WHISPER_API int whisper_full_n_tokens_from_state(struct whisper_state * state, int i_segment);
655
+
656
+ // Get the token text of the specified token in the specified segment
657
+ WHISPER_API const char * whisper_full_get_token_text (struct whisper_context * ctx, int i_segment, int i_token);
658
+ WHISPER_API const char * whisper_full_get_token_text_from_state(struct whisper_context * ctx, struct whisper_state * state, int i_segment, int i_token); // unlike the other *_from_state accessors, this one takes both ctx and state
659
+
660
+ WHISPER_API whisper_token whisper_full_get_token_id (struct whisper_context * ctx, int i_segment, int i_token); // token id of the specified token in the specified segment (per the function name)
661
+ WHISPER_API whisper_token whisper_full_get_token_id_from_state(struct whisper_state * state, int i_segment, int i_token);
662
+
663
+ // Get token data for the specified token in the specified segment
664
+ // This contains probabilities, timestamps, etc.
665
+ WHISPER_API whisper_token_data whisper_full_get_token_data (struct whisper_context * ctx, int i_segment, int i_token); // returns the struct by value
666
+ WHISPER_API whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token);
667
+
668
+ // Get the probability of the specified token in the specified segment
669
+ WHISPER_API float whisper_full_get_token_p (struct whisper_context * ctx, int i_segment, int i_token);
670
+ WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
671
+
672
+ //
673
+ // Voice Activity Detection (VAD)
674
+ //
675
+
676
+ struct whisper_vad_context; // opaque here: forward declaration only, used through pointers below
677
+
678
+ WHISPER_API struct whisper_vad_params whisper_vad_default_params(void); // returns the struct by value; same type as whisper_full_params.vad_params
679
+
680
+ struct whisper_vad_context_params { // passed by value to the whisper_vad_init_* functions
681
+ int n_threads; // The number of threads to use for processing.
682
+ bool use_gpu; // NOTE(review): presumably enables GPU execution for VAD - confirm
683
+ int gpu_device; // CUDA device
684
+ };
685
+
686
+ WHISPER_API struct whisper_vad_context_params whisper_vad_default_context_params(void); // returns the struct by value
687
+
688
+ WHISPER_API struct whisper_vad_context * whisper_vad_init_from_file_with_params(const char * path_model, struct whisper_vad_context_params params); // NOTE(review): presumably returns NULL on failure and is released with whisper_vad_free() - confirm
689
+ WHISPER_API struct whisper_vad_context * whisper_vad_init_with_params (struct whisper_model_loader * loader, struct whisper_vad_context_params params); // takes a whisper_model_loader instead of a file path
690
+
691
+ WHISPER_API bool whisper_vad_detect_speech( // NOTE(review): meaning of the bool result (speech found vs. call succeeded) is not stated here - confirm
692
+ struct whisper_vad_context * vctx,
693
+ const float * samples,
694
+ int n_samples);
695
+
696
+ WHISPER_API int whisper_vad_n_probs(struct whisper_vad_context * vctx); // NOTE(review): presumably the number of floats readable through whisper_vad_probs() - confirm
697
+ WHISPER_API float * whisper_vad_probs (struct whisper_vad_context * vctx); // NOTE(review): ownership of the returned buffer is not stated - presumably owned by vctx
698
+
699
+ struct whisper_vad_segments; // opaque here: forward declaration only, used through pointers below
700
+
701
+ WHISPER_API struct whisper_vad_segments * whisper_vad_segments_from_probs( // takes no samples; NOTE(review): presumably uses probabilities already stored in vctx - confirm
702
+ struct whisper_vad_context * vctx,
703
+ struct whisper_vad_params params);
704
+
705
+ WHISPER_API struct whisper_vad_segments * whisper_vad_segments_from_samples( // variant that also takes the audio samples
706
+ struct whisper_vad_context * vctx,
707
+ struct whisper_vad_params params,
708
+ const float * samples,
709
+ int n_samples);
710
+
711
+ WHISPER_API int whisper_vad_segments_n_segments(struct whisper_vad_segments * segments);
712
+
713
+ WHISPER_API float whisper_vad_segments_get_segment_t0(struct whisper_vad_segments * segments, int i_segment); // float, unlike the int64_t whisper_full_get_segment_t0(); NOTE(review): unit is not stated here
714
+ WHISPER_API float whisper_vad_segments_get_segment_t1(struct whisper_vad_segments * segments, int i_segment);
715
+
716
+ WHISPER_API void whisper_vad_free_segments(struct whisper_vad_segments * segments); // NOTE(review): presumably releases results of whisper_vad_segments_from_*() - confirm
717
+ WHISPER_API void whisper_vad_free (struct whisper_vad_context * ctx); // parameter is named ctx here but vctx in the other VAD functions
718
+
719
+ ////////////////////////////////////////////////////////////////////////////
720
+
721
+ // Temporary helpers needed for exposing ggml interface
722
+
723
+ WHISPER_API int whisper_bench_memcpy (int n_threads); // NOTE(review): meaning of the int result is not documented here
724
+ WHISPER_API const char * whisper_bench_memcpy_str (int n_threads); // _str variant: returns a string instead of an int
725
+ WHISPER_API int whisper_bench_wsp_ggml_mul_mat (int n_threads); // NOTE(review): meaning of the int result is not documented here
726
+ WHISPER_API const char * whisper_bench_wsp_ggml_mul_mat_str(int n_threads); // _str variant: returns a string instead of an int
727
+
728
+ // Control logging output; default behavior is to print to stderr
729
+
730
+ WHISPER_API void whisper_log_set(wsp_ggml_log_callback log_callback, void * user_data); // replaces the default stderr logging; NOTE(review): user_data is presumably passed back to log_callback - confirm
731
+
732
+ // Get the no_speech probability for the specified segment
733
+ WHISPER_API float whisper_full_get_segment_no_speech_prob (struct whisper_context * ctx, int i_segment); // NOTE(review): presumably the value compared against whisper_full_params.no_speech_thold - confirm
734
+ WHISPER_API float whisper_full_get_segment_no_speech_prob_from_state(struct whisper_state * state, int i_segment); // same accessor for an explicit state
735
+ #ifdef __cplusplus
736
+ }
737
+ #endif
738
+
739
+ #endif