whisper.rn 0.5.0-rc.9 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. package/android/build.gradle +2 -1
  2. package/android/gradle.properties +1 -1
  3. package/cpp/ggml-alloc.c +265 -141
  4. package/cpp/ggml-backend-impl.h +4 -1
  5. package/cpp/ggml-backend-reg.cpp +30 -13
  6. package/cpp/ggml-backend.cpp +221 -38
  7. package/cpp/ggml-backend.h +17 -1
  8. package/cpp/ggml-common.h +17 -0
  9. package/cpp/ggml-cpu/amx/amx.cpp +4 -2
  10. package/cpp/ggml-cpu/arch/arm/quants.c +132 -596
  11. package/cpp/ggml-cpu/arch/arm/repack.cpp +14 -286
  12. package/cpp/ggml-cpu/arch/x86/quants.c +184 -675
  13. package/cpp/ggml-cpu/arch/x86/repack.cpp +4679 -1657
  14. package/cpp/ggml-cpu/arch-fallback.h +32 -2
  15. package/cpp/ggml-cpu/common.h +14 -0
  16. package/cpp/ggml-cpu/ggml-cpu-impl.h +13 -6
  17. package/cpp/ggml-cpu/ggml-cpu.c +70 -42
  18. package/cpp/ggml-cpu/ggml-cpu.cpp +35 -28
  19. package/cpp/ggml-cpu/ops.cpp +1587 -1177
  20. package/cpp/ggml-cpu/ops.h +5 -8
  21. package/cpp/ggml-cpu/quants.c +35 -0
  22. package/cpp/ggml-cpu/quants.h +8 -0
  23. package/cpp/ggml-cpu/repack.cpp +458 -47
  24. package/cpp/ggml-cpu/repack.h +22 -0
  25. package/cpp/ggml-cpu/simd-mappings.h +89 -60
  26. package/cpp/ggml-cpu/traits.cpp +2 -2
  27. package/cpp/ggml-cpu/traits.h +1 -1
  28. package/cpp/ggml-cpu/vec.cpp +170 -26
  29. package/cpp/ggml-cpu/vec.h +506 -63
  30. package/cpp/ggml-cpu.h +1 -1
  31. package/cpp/ggml-impl.h +119 -9
  32. package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
  33. package/cpp/ggml-metal/ggml-metal-common.h +52 -0
  34. package/cpp/ggml-metal/ggml-metal-context.h +33 -0
  35. package/cpp/ggml-metal/ggml-metal-context.m +600 -0
  36. package/cpp/ggml-metal/ggml-metal-device.cpp +1376 -0
  37. package/cpp/ggml-metal/ggml-metal-device.h +226 -0
  38. package/cpp/ggml-metal/ggml-metal-device.m +1312 -0
  39. package/cpp/ggml-metal/ggml-metal-impl.h +722 -0
  40. package/cpp/ggml-metal/ggml-metal-ops.cpp +3158 -0
  41. package/cpp/ggml-metal/ggml-metal-ops.h +82 -0
  42. package/cpp/ggml-metal/ggml-metal.cpp +718 -0
  43. package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
  44. package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
  45. package/cpp/ggml-metal-impl.h +90 -51
  46. package/cpp/ggml-metal.h +1 -6
  47. package/cpp/ggml-opt.cpp +97 -41
  48. package/cpp/ggml-opt.h +25 -6
  49. package/cpp/ggml-quants.c +111 -16
  50. package/cpp/ggml-quants.h +6 -0
  51. package/cpp/ggml.c +486 -98
  52. package/cpp/ggml.h +221 -16
  53. package/cpp/gguf.cpp +8 -1
  54. package/cpp/jsi/RNWhisperJSI.cpp +25 -6
  55. package/cpp/jsi/ThreadPool.h +3 -3
  56. package/cpp/whisper.cpp +100 -76
  57. package/cpp/whisper.h +1 -0
  58. package/ios/CMakeLists.txt +6 -1
  59. package/ios/RNWhisper.mm +6 -6
  60. package/ios/RNWhisperContext.mm +2 -0
  61. package/ios/RNWhisperVadContext.mm +16 -13
  62. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  63. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  64. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
  65. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  66. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +119 -9
  67. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
  68. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  69. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  70. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  71. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +221 -16
  72. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  73. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  74. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  75. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  76. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  77. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  78. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
  79. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  80. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +119 -9
  81. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
  82. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  83. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  84. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  85. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +221 -16
  86. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  87. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  88. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  89. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  90. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  91. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  92. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  93. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
  94. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  95. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +119 -9
  96. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
  97. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  98. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  99. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  100. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +221 -16
  101. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  102. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  103. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  104. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  105. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  106. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  107. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
  108. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  109. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +119 -9
  110. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
  111. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  112. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  113. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  114. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +221 -16
  115. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  116. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  117. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  118. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  119. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  120. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +13 -0
  121. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -1
  122. package/lib/commonjs/version.json +1 -1
  123. package/lib/module/realtime-transcription/RealtimeTranscriber.js +13 -0
  124. package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -1
  125. package/lib/module/version.json +1 -1
  126. package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -1
  127. package/lib/typescript/realtime-transcription/types.d.ts +6 -0
  128. package/lib/typescript/realtime-transcription/types.d.ts.map +1 -1
  129. package/package.json +1 -1
  130. package/src/realtime-transcription/RealtimeTranscriber.ts +17 -0
  131. package/src/realtime-transcription/types.ts +6 -0
  132. package/src/version.json +1 -1
  133. package/whisper-rn.podspec +8 -9
  134. package/cpp/ggml-metal.m +0 -6284
  135. package/cpp/ggml-whisper-sim.metallib +0 -0
  136. package/cpp/ggml-whisper.metallib +0 -0
package/cpp/whisper.cpp CHANGED
@@ -21,14 +21,12 @@
21
21
  #define _USE_MATH_DEFINES
22
22
  #include <cmath>
23
23
  #include <climits>
24
- #include <codecvt>
25
24
  #include <cstdarg>
26
25
  #include <cstdio>
27
26
  #include <cstring>
28
27
  #include <fstream>
29
28
  #include <functional>
30
29
  #include <map>
31
- #include <mutex>
32
30
  #include <random>
33
31
  #include <regex>
34
32
  #include <set>
@@ -36,6 +34,10 @@
36
34
  #include <thread>
37
35
  #include <vector>
38
36
 
37
+ #ifdef _MSC_VER
38
+ #include <codecvt>
39
+ #endif
40
+
39
41
  #if defined(WHISPER_BIG_ENDIAN)
40
42
  template<typename T>
41
43
  static T byteswap(T value) {
@@ -138,6 +140,10 @@ static void whisper_log_callback_default(wsp_ggml_log_level level, const char *
138
140
  } while (0)
139
141
 
140
142
  #define WHISPER_MAX_DECODERS 8
143
+
144
+ // temperature below which we condition on past text history
145
+ static constexpr float WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF = 0.5f;
146
+
141
147
  #define WHISPER_MAX_NODES 4096
142
148
 
143
149
  static std::string format(const char * fmt, ...) {
@@ -252,45 +258,6 @@ static void whisper_set_i32_nd(struct wsp_ggml_tensor * t, int64_t i0, int64_t i
252
258
  *(int32_t *) data = v;
253
259
  }
254
260
 
255
- // faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
256
- // the idea is to represent the original matrix multiplication:
257
- //
258
- // Z = X @ Y
259
- //
260
- // with the sum of two matrix multiplications:
261
- //
262
- // Z = (X_0 @ Y_0) + (X_1 @ Y_1)
263
- //
264
- // here X_0 and Y_0 are views of X and Y that have dimension 0 divisible by "pad"
265
- // and X_1 and Y_1 are the remaining views. X_1 and Y_1 end up being small matrices that can be processed with more
266
- // general-purpose kernels
267
- //
268
- static struct wsp_ggml_tensor * wsp_ggml_mul_mat_pad(struct wsp_ggml_context * ctx, struct wsp_ggml_tensor * x, struct wsp_ggml_tensor * y, int pad = 32) {
269
- // use padding only if dimension 0 is at least 8 times larger than the padding
270
- // else we won't get much benefit from the optimization
271
- const int n_pad_req = 8;
272
-
273
- if (x->ne[0] % pad == 0 || x->ne[0] / pad < n_pad_req) {
274
- return wsp_ggml_mul_mat(ctx, x, y);
275
- }
276
-
277
- struct wsp_ggml_tensor * x_0 = wsp_ggml_view_3d(ctx, x, (x->ne[0]/pad)*pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], 0);
278
- struct wsp_ggml_tensor * x_1 = wsp_ggml_view_3d(ctx, x, x->ne[0]%pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], x_0->ne[0]*x_0->nb[0]);
279
-
280
- struct wsp_ggml_tensor * y_0 = wsp_ggml_view_3d(ctx, y, (y->ne[0]/pad)*pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], 0);
281
- struct wsp_ggml_tensor * y_1 = wsp_ggml_view_3d(ctx, y, y->ne[0]%pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], y_0->ne[0]*y_0->nb[0]);
282
-
283
- return wsp_ggml_add(ctx,
284
- wsp_ggml_mul_mat(ctx, x_0, y_0),
285
- wsp_ggml_mul_mat(ctx, x_1, y_1));
286
- }
287
-
288
- // TODO: check if other platforms can benefit from this optimization
289
- // TODO: CUDA is currently broken - seems wsp_ggml_mul_mat does not handle views correctly
290
- #if defined(WSP_GGML_USE_METAL)
291
- #define wsp_ggml_mul_mat wsp_ggml_mul_mat_pad
292
- #endif
293
-
294
261
  // available whisper models
295
262
  enum e_model {
296
263
  MODEL_UNKNOWN,
@@ -919,7 +886,10 @@ struct whisper_state {
919
886
  std::vector<float> logits;
920
887
 
921
888
  std::vector<whisper_segment> result_all;
922
- std::vector<whisper_token> prompt_past;
889
+
890
+ // prompt history split into static prefix (prompt_past0) and dynamic rolling context (prompt_past1)
891
+ std::vector<whisper_token> prompt_past0; // static carried initial prompt (if enabled)
892
+ std::vector<whisper_token> prompt_past1; // dynamic context from decoded output
923
893
 
924
894
  int lang_id = 0; // english by default
925
895
 
@@ -1327,7 +1297,7 @@ static wsp_ggml_backend_t whisper_backend_init_gpu(const whisper_context_params
1327
1297
  for (size_t i = 0; i < wsp_ggml_backend_dev_count(); ++i) {
1328
1298
  wsp_ggml_backend_dev_t dev_cur = wsp_ggml_backend_dev_get(i);
1329
1299
  if (wsp_ggml_backend_dev_type(dev_cur) == WSP_GGML_BACKEND_DEVICE_TYPE_GPU) {
1330
- if (cnt == 0 || cnt == params.gpu_device) {
1300
+ if (cnt == params.gpu_device) {
1331
1301
  dev = dev_cur;
1332
1302
  }
1333
1303
 
@@ -1396,7 +1366,7 @@ static buft_list_t make_buft_list(whisper_context_params & params) {
1396
1366
  for (size_t i = 0; i < wsp_ggml_backend_dev_count(); ++i) {
1397
1367
  wsp_ggml_backend_dev_t dev = wsp_ggml_backend_dev_get(i);
1398
1368
  if (wsp_ggml_backend_dev_type(dev) == WSP_GGML_BACKEND_DEVICE_TYPE_GPU) {
1399
- if (cnt == 0 || cnt == params.gpu_device) {
1369
+ if (cnt == params.gpu_device) {
1400
1370
  auto * buft = wsp_ggml_backend_dev_buffer_type(dev);
1401
1371
  if (buft) {
1402
1372
  buft_list.emplace_back(dev, buft);
@@ -1438,7 +1408,8 @@ static bool weight_buft_supported(const whisper_hparams & hparams, wsp_ggml_tens
1438
1408
  op_supported = true;
1439
1409
  } else {
1440
1410
  switch (op) {
1441
- // The current extra_buffer_type implementations only support WSP_GGML_OP_MUL_MAT
1411
+ // The current extra_buffer_type implementations only support WSP_GGML_OP_MUL_MAT and WSP_GGML_OP_GET_ROWS
1412
+ case WSP_GGML_OP_GET_ROWS:
1442
1413
  case WSP_GGML_OP_MUL_MAT: {
1443
1414
  wsp_ggml_init_params params = {
1444
1415
  /*.mem_size =*/ 2 * wsp_ggml_tensor_overhead(),
@@ -1454,9 +1425,15 @@ static bool weight_buft_supported(const whisper_hparams & hparams, wsp_ggml_tens
1454
1425
 
1455
1426
  wsp_ggml_tensor * op_tensor = nullptr;
1456
1427
 
1457
- int64_t n_ctx = hparams.n_audio_ctx;
1458
- wsp_ggml_tensor * b = wsp_ggml_new_tensor_4d(ctx, WSP_GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]);
1459
- op_tensor = wsp_ggml_mul_mat(ctx, w, b);
1428
+ if (op == WSP_GGML_OP_MUL_MAT) {
1429
+ int64_t n_ctx = hparams.n_audio_ctx;
1430
+ wsp_ggml_tensor * b = wsp_ggml_new_tensor_4d(ctx, WSP_GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]);
1431
+ op_tensor = wsp_ggml_mul_mat(ctx, w, b);
1432
+ } else if (op == WSP_GGML_OP_GET_ROWS) {
1433
+ int64_t num_indices = 8;
1434
+ wsp_ggml_tensor * indices = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, num_indices);
1435
+ op_tensor = wsp_ggml_get_rows(ctx, w, indices);
1436
+ }
1460
1437
 
1461
1438
  // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
1462
1439
  WSP_GGML_ASSERT(w->buffer == nullptr);
@@ -2425,6 +2402,8 @@ static bool whisper_encode_internal(
2425
2402
  return false;
2426
2403
  }
2427
2404
  } else {
2405
+ wsp_ggml_backend_sched_reset(sched);
2406
+
2428
2407
  #if defined(WHISPER_USE_COREML)
2429
2408
  whisper_coreml_encode(wstate.ctx_coreml, mel->ne[0], mel->ne[1], (float *) mel->data, (float *) wstate.embd_enc->data);
2430
2409
  #elif defined(WHISPER_USE_OPENVINO)
@@ -3626,7 +3605,7 @@ struct whisper_context_params whisper_context_default_params() {
3626
3605
  struct whisper_context_params result = {
3627
3606
  /*.use_gpu =*/ true,
3628
3607
  /*.use_coreml =*/ false,
3629
- /*.flash_attn =*/ false,
3608
+ /*.flash_attn =*/ true,
3630
3609
  /*.gpu_device =*/ 0,
3631
3610
 
3632
3611
  /*.dtw_token_timestamps =*/ false,
@@ -4710,6 +4689,7 @@ static bool whisper_vad_init_context(whisper_vad_context * vctx) {
4710
4689
  wsp_ggml_set_name(vctx->c_state, "c_state");
4711
4690
 
4712
4691
  vctx->buffer = wsp_ggml_backend_alloc_ctx_tensors(ctx, vctx->backends[0]);
4692
+ wsp_ggml_free(ctx);
4713
4693
  if (!vctx->buffer) {
4714
4694
  WHISPER_LOG_ERROR("%s: failed to allocate memory for the VAD state\n", __func__);
4715
4695
  return false;
@@ -5454,6 +5434,9 @@ struct whisper_vad_segments * whisper_vad_segments_from_samples(
5454
5434
 
5455
5435
  void whisper_vad_free(whisper_vad_context * ctx) {
5456
5436
  if (ctx) {
5437
+ if (ctx->buffer) {
5438
+ wsp_ggml_backend_buffer_free(ctx->buffer);
5439
+ }
5457
5440
  for (wsp_ggml_context * context : ctx->model.ctxs) {
5458
5441
  wsp_ggml_free(context);
5459
5442
  }
@@ -5468,6 +5451,9 @@ void whisper_vad_free(whisper_vad_context * ctx) {
5468
5451
  wsp_ggml_backend_free(backend);
5469
5452
  }
5470
5453
 
5454
+ delete[] ctx->model.hparams.encoder_in_channels;
5455
+ delete[] ctx->model.hparams.encoder_out_channels;
5456
+ delete[] ctx->model.hparams.kernel_sizes;
5471
5457
 
5472
5458
  delete ctx;
5473
5459
  }
@@ -5947,9 +5933,10 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
5947
5933
 
5948
5934
  /* suppress_regex =*/ nullptr,
5949
5935
 
5950
- /*.initial_prompt =*/ nullptr,
5951
- /*.prompt_tokens =*/ nullptr,
5952
- /*.prompt_n_tokens =*/ 0,
5936
+ /*.initial_prompt =*/ nullptr,
5937
+ /*.carry_initial_prompt =*/ false,
5938
+ /*.prompt_tokens =*/ nullptr,
5939
+ /*.prompt_n_tokens =*/ 0,
5953
5940
 
5954
5941
  /*.language =*/ "en",
5955
5942
  /*.detect_language =*/ false,
@@ -6645,6 +6632,10 @@ static bool whisper_vad(
6645
6632
 
6646
6633
  whisper_vad_segments * vad_segments = whisper_vad_segments_from_samples(vctx, vad_params, samples, n_samples);
6647
6634
 
6635
+ if (!vad_segments) {
6636
+ return false;
6637
+ }
6638
+
6648
6639
  if (vad_segments->data.size() > 0) {
6649
6640
  state->has_vad_segments = true;
6650
6641
  ctx->state->vad_segments.clear();
@@ -6687,7 +6678,6 @@ static bool whisper_vad(
6687
6678
  } catch (const std::bad_alloc & /* e */) {
6688
6679
  WHISPER_LOG_ERROR("%s: failed to allocate memory for filtered samples\n", __func__);
6689
6680
  whisper_vad_free_segments(vad_segments);
6690
- whisper_vad_free(vctx);
6691
6681
  return false;
6692
6682
  }
6693
6683
 
@@ -6793,6 +6783,7 @@ static bool whisper_vad(
6793
6783
  __func__, n_samples, filtered_n_samples, 100.0f * (1.0f - (float)filtered_n_samples / n_samples));
6794
6784
  }
6795
6785
 
6786
+ whisper_vad_free_segments(vad_segments);
6796
6787
  return true;
6797
6788
  }
6798
6789
 
@@ -6901,17 +6892,22 @@ int whisper_full_with_state(
6901
6892
  decoder.rng = std::mt19937(j);
6902
6893
  }
6903
6894
 
6904
- // the accumulated text context so far
6905
- auto & prompt_past = state->prompt_past;
6895
+ // the accumulated text context split into static (prompt_past0) and dynamic (prompt_past1)
6896
+ auto & prompt_past0 = state->prompt_past0;
6897
+ auto & prompt_past1 = state->prompt_past1;
6906
6898
  if (params.no_context) {
6907
- prompt_past.clear();
6899
+ prompt_past0.clear();
6900
+ prompt_past1.clear();
6908
6901
  }
6909
6902
 
6903
+ // calculate the maximum context budget for prompt history
6904
+ const int max_prompt_ctx = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);
6905
+
6910
6906
  // prepare prompt
6911
6907
  {
6912
6908
  std::vector<whisper_token> prompt_tokens;
6913
6909
 
6914
- // initial prompt
6910
+ // tokenize the initial prompt
6915
6911
  if (!params.prompt_tokens && params.initial_prompt) {
6916
6912
  prompt_tokens.resize(1024);
6917
6913
  int n_needed = whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size());
@@ -6923,14 +6919,25 @@ int whisper_full_with_state(
6923
6919
  params.prompt_tokens = prompt_tokens.data();
6924
6920
  params.prompt_n_tokens = prompt_tokens.size();
6925
6921
  }
6926
-
6927
- // prepend the prompt tokens to the prompt_past
6928
6922
  if (params.prompt_tokens && params.prompt_n_tokens > 0) {
6929
- // parse tokens from the pointer
6930
- for (int i = 0; i < params.prompt_n_tokens; i++) {
6931
- prompt_past.push_back(params.prompt_tokens[i]);
6923
+ if (params.carry_initial_prompt) {
6924
+ if (prompt_past0.empty()) {
6925
+ const int max_tokens = std::max(1, max_prompt_ctx - 1);
6926
+
6927
+ if (params.prompt_n_tokens > max_tokens) {
6928
+ WHISPER_LOG_WARN("%s: initial prompt is too long (%d tokens), will use only the last %d tokens\n",
6929
+ __func__, params.prompt_n_tokens, max_tokens);
6930
+ }
6931
+
6932
+ const int n_tokens = std::min(params.prompt_n_tokens, max_tokens);
6933
+ prompt_past0.assign(params.prompt_tokens + (params.prompt_n_tokens - n_tokens), params.prompt_tokens + params.prompt_n_tokens);
6934
+ }
6935
+ } else {
6936
+ for (int i = 0; i < params.prompt_n_tokens; ++i) {
6937
+ prompt_past1.push_back(params.prompt_tokens[i]);
6938
+ }
6939
+ std::rotate(prompt_past1.begin(), prompt_past1.end() - params.prompt_n_tokens, prompt_past1.end());
6932
6940
  }
6933
- std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end());
6934
6941
  }
6935
6942
  }
6936
6943
 
@@ -7016,7 +7023,8 @@ int whisper_full_with_state(
7016
7023
  // if there is a very short audio segment left to process, we remove any past prompt since it tends
7017
7024
  // to confuse the decoder and often make it repeat or hallucinate stuff
7018
7025
  if (seek > seek_start && seek + 500 >= seek_end) {
7019
- prompt_past.clear();
7026
+ prompt_past0.clear();
7027
+ prompt_past1.clear();
7020
7028
  }
7021
7029
 
7022
7030
  int best_decoder_id = 0;
@@ -7077,12 +7085,25 @@ int whisper_full_with_state(
7077
7085
  {
7078
7086
  prompt.clear();
7079
7087
 
7080
- // if we have already generated some text, use it as a prompt to condition the next generation
7081
- if (!prompt_past.empty() && t_cur < 0.5f && params.n_max_text_ctx > 0) {
7082
- int n_take = std::min(std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2), int(prompt_past.size()));
7088
+ if (params.n_max_text_ctx > 0 && t_cur < WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF) {
7089
+ const bool can_take0 = params.carry_initial_prompt && !prompt_past0.empty();
7090
+ const bool can_take1 = !prompt_past1.empty();
7083
7091
 
7084
- prompt = { whisper_token_prev(ctx) };
7085
- prompt.insert(prompt.begin() + 1, prompt_past.end() - n_take, prompt_past.end());
7092
+ if (max_prompt_ctx > 0 && (can_take0 || can_take1)) {
7093
+ // Always start with previous token marker to connect continuity
7094
+ prompt.push_back(whisper_token_prev(ctx));
7095
+
7096
+ // Take static tokens (initial prompt) first
7097
+ int n_take0 = 0;
7098
+ if (can_take0) {
7099
+ n_take0 = prompt_past0.size();
7100
+ prompt.insert(prompt.end(), prompt_past0.end() - n_take0, prompt_past0.end());
7101
+ }
7102
+
7103
+ // Fill remaining budget with dynamic tokens (rolling context)
7104
+ const int n_take1 = std::min<int>(max_prompt_ctx - n_take0 - 1, prompt_past1.size());
7105
+ prompt.insert(prompt.end(), prompt_past1.end() - n_take1, prompt_past1.end());
7106
+ }
7086
7107
  }
7087
7108
 
7088
7109
  // init new transcription with sot, language (opt) and task tokens
@@ -7564,14 +7585,17 @@ int whisper_full_with_state(
7564
7585
 
7565
7586
  //WHISPER_LOG_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta);
7566
7587
 
7567
- // update prompt_past
7568
- prompt_past.clear();
7569
- if (prompt.front() == whisper_token_prev(ctx)) {
7570
- prompt_past.insert(prompt_past.end(), prompt.begin() + 1, prompt.end() - prompt_init.size());
7588
+ // update prompt_past1
7589
+ prompt_past1.clear();
7590
+ if (!params.carry_initial_prompt && !prompt.empty() && prompt.front() == whisper_token_prev(ctx)) {
7591
+ prompt_past1.insert(prompt_past1.end(), prompt.begin() + 1, prompt.end() - prompt_init.size());
7571
7592
  }
7572
7593
 
7573
- for (int i = 0; i < result_len && !is_no_speech; ++i) {
7574
- prompt_past.push_back(tokens_cur[i].id);
7594
+ // Add newly decoded tokens to the rolling context
7595
+ if (!is_no_speech) {
7596
+ for (int i = 0; i < result_len; ++i) {
7597
+ prompt_past1.push_back(tokens_cur[i].id);
7598
+ }
7575
7599
  }
7576
7600
 
7577
7601
  if (!tokens_cur.empty() && ctx->model.n_loaded > 0 && !is_no_speech) {
@@ -8943,7 +8967,7 @@ void whisper_log_set(wsp_ggml_log_callback log_callback, void * user_data) {
8943
8967
  }
8944
8968
 
8945
8969
  const char * whisper_version(void) {
8946
- return "1.7.6";
8970
+ return "1.8.0";
8947
8971
  }
8948
8972
 
8949
8973
  WSP_GGML_ATTRIBUTE_FORMAT(2, 3)
package/cpp/whisper.h CHANGED
@@ -526,6 +526,7 @@ extern "C" {
526
526
  // use whisper_tokenize() to convert text to tokens
527
527
  // maximum of whisper_n_text_ctx()/2 tokens are used (typically 224)
528
528
  const char * initial_prompt;
529
+ bool carry_initial_prompt; // if true, always prepend initial_prompt to every decode window (may reduce conditioning on previous text)
529
530
  const whisper_token * prompt_tokens;
530
531
  int prompt_n_tokens;
531
532
 
package/ios/CMakeLists.txt CHANGED
@@ -55,7 +55,12 @@ add_library(rnwhisper SHARED
55
55
  ${SOURCE_DIR}/ggml-cpu/binary-ops.cpp
56
56
  ${SOURCE_DIR}/ggml-cpu/vec.cpp
57
57
  ${SOURCE_DIR}/ggml-cpu/ops.cpp
58
- ${SOURCE_DIR}/ggml-metal.m
58
+ ${SOURCE_DIR}/ggml-metal/ggml-metal.cpp
59
+ ${SOURCE_DIR}/ggml-metal/ggml-metal-common.cpp
60
+ ${SOURCE_DIR}/ggml-metal/ggml-metal-device.cpp
61
+ ${SOURCE_DIR}/ggml-metal/ggml-metal-context.m
62
+ ${SOURCE_DIR}/ggml-metal/ggml-metal-device.m
63
+ ${SOURCE_DIR}/ggml-metal/ggml-metal-ops.cpp
59
64
  ${SOURCE_DIR}/ggml-opt.cpp
60
65
  ${SOURCE_DIR}/ggml-threading.cpp
61
66
  ${SOURCE_DIR}/ggml-quants.c
package/ios/RNWhisper.mm CHANGED
@@ -357,10 +357,9 @@ RCT_REMAP_METHOD(releaseContext,
357
357
  reject(@"whisper_error", @"Context not found", nil);
358
358
  return;
359
359
  }
360
- [context invalidate];
361
- [contexts removeObjectForKey:[NSNumber numberWithInt:contextId]];
362
- // Also remove from unified context management
363
360
  rnwhisper_jsi::removeContext(contextId);
361
+ [contexts removeObjectForKey:[NSNumber numberWithInt:contextId]];
362
+ [context invalidate];
364
363
  resolve(nil);
365
364
  }
366
365
 
@@ -555,10 +554,9 @@ RCT_REMAP_METHOD(releaseVadContext,
555
554
  reject(@"whisper_vad_error", @"VAD context not found", nil);
556
555
  return;
557
556
  }
558
- [vadContext invalidate];
559
- [vadContexts removeObjectForKey:[NSNumber numberWithInt:contextId]];
560
- // Also remove from unified context management
561
557
  rnwhisper_jsi::removeVadContext(contextId);
558
+ [vadContexts removeObjectForKey:[NSNumber numberWithInt:contextId]];
559
+ [vadContext invalidate];
562
560
  resolve(nil);
563
561
  }
564
562
 
@@ -574,6 +572,7 @@ RCT_EXPORT_METHOD(releaseAllVadContexts:(RCTPromiseResolveBlock)resolve
574
572
  if (contexts != nil) {
575
573
  for (NSNumber *contextId in contexts) {
576
574
  RNWhisperContext *context = contexts[contextId];
575
+ rnwhisper_jsi::removeContext([contextId intValue]);
577
576
  [context invalidate];
578
577
  }
579
578
  [contexts removeAllObjects];
@@ -585,6 +584,7 @@ RCT_EXPORT_METHOD(releaseAllVadContexts:(RCTPromiseResolveBlock)resolve
585
584
  if (vadContexts != nil) {
586
585
  for (NSNumber *contextId in vadContexts) {
587
586
  RNWhisperVadContext *vadContext = vadContexts[contextId];
587
+ rnwhisper_jsi::removeVadContext([contextId intValue]);
588
588
  [vadContext invalidate];
589
589
  }
590
590
  [vadContexts removeAllObjects];
package/ios/RNWhisperContext.mm CHANGED
@@ -87,6 +87,8 @@ static void* retained_log_block = nullptr;
87
87
 
88
88
  #ifdef WSP_GGML_USE_METAL
89
89
  if (cparams.use_gpu) {
90
+ cparams.gpu_device = 0;
91
+
90
92
  id<MTLDevice> device = MTLCreateSystemDefaultDevice();
91
93
 
92
94
  // Check ggml-metal availability
package/ios/RNWhisperVadContext.mm CHANGED
@@ -20,25 +20,28 @@
20
20
 
21
21
  #ifdef WSP_GGML_USE_METAL
22
22
  if (ctx_params.use_gpu) {
23
- id<MTLDevice> device = MTLCreateSystemDefaultDevice();
24
-
25
- // Check ggml-metal availability
26
- BOOL supportsGgmlMetal = [device supportsFamily:MTLGPUFamilyApple7];
27
- if (@available(iOS 16.0, tvOS 16.0, *)) {
28
- supportsGgmlMetal = supportsGgmlMetal && [device supportsFamily:MTLGPUFamilyMetal3];
29
- }
30
- if (!supportsGgmlMetal) {
31
- ctx_params.use_gpu = false;
32
- reasonNoMetal = @"Metal is not supported in this device";
33
- }
23
+ // TODO: GPU VAD is forced disabled until the performance is improved (ref: whisper.cpp/whisper_vad_init_context)
24
+ ctx_params.use_gpu = false;
25
+ // ctx_params.gpu_device = 0;
26
+
27
+ // id<MTLDevice> device = MTLCreateSystemDefaultDevice();
28
+
29
+ // // Check ggml-metal availability
30
+ // BOOL supportsGgmlMetal = [device supportsFamily:MTLGPUFamilyApple7];
31
+ // if (@available(iOS 16.0, tvOS 16.0, *)) {
32
+ // supportsGgmlMetal = supportsGgmlMetal && [device supportsFamily:MTLGPUFamilyMetal3];
33
+ // }
34
+ // if (!supportsGgmlMetal) {
35
+ // ctx_params.use_gpu = false;
36
+ // reasonNoMetal = @"Metal is not supported in this device";
37
+ // }
38
+ // device = nil;
34
39
 
35
40
  #if TARGET_OS_SIMULATOR
36
41
  // Use the backend, but no layers because not supported fully on simulator
37
42
  ctx_params.use_gpu = false;
38
43
  reasonNoMetal = @"Metal is not supported in simulator";
39
44
  #endif
40
-
41
- device = nil;
42
45
  }
43
46
  #endif // WSP_GGML_USE_METAL
44
47
 
@@ -8,7 +8,7 @@
8
8
  extern "C" {
9
9
  #endif
10
10
 
11
- #define WSP_GGML_BACKEND_API_VERSION 1
11
+ #define WSP_GGML_BACKEND_API_VERSION 2
12
12
 
13
13
  //
14
14
  // Backend buffer type
@@ -114,6 +114,9 @@ extern "C" {
114
114
  void (*event_record)(wsp_ggml_backend_t backend, wsp_ggml_backend_event_t event);
115
115
  // wait for an event on on a different stream
116
116
  void (*event_wait) (wsp_ggml_backend_t backend, wsp_ggml_backend_event_t event);
117
+
118
+ // (optional) sort/optimize the nodes in the graph
119
+ void (*graph_optimize) (wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph);
117
120
  };
118
121
 
119
122
  struct wsp_ggml_backend {
@@ -132,6 +132,8 @@ extern "C" {
132
132
  WSP_GGML_BACKEND_DEVICE_TYPE_CPU,
133
133
  // GPU device using dedicated memory
134
134
  WSP_GGML_BACKEND_DEVICE_TYPE_GPU,
135
+ // integrated GPU device using host memory
136
+ WSP_GGML_BACKEND_DEVICE_TYPE_IGPU,
135
137
  // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
136
138
  WSP_GGML_BACKEND_DEVICE_TYPE_ACCEL
137
139
  };
@@ -150,11 +152,21 @@ extern "C" {
150
152
 
151
153
  // all the device properties
152
154
  struct wsp_ggml_backend_dev_props {
155
+ // device name
153
156
  const char * name;
157
+ // device description
154
158
  const char * description;
159
+ // device free memory in bytes
155
160
  size_t memory_free;
161
+ // device total memory in bytes
156
162
  size_t memory_total;
163
+ // device type
157
164
  enum wsp_ggml_backend_dev_type type;
165
+ // device id
166
+ // for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
167
+ // if the id is unknown, this should be NULL
168
+ const char * device_id;
169
+ // device capabilities
158
170
  struct wsp_ggml_backend_dev_caps caps;
159
171
  };
160
172
 
@@ -302,11 +314,15 @@ extern "C" {
302
314
  WSP_GGML_API int wsp_ggml_backend_sched_get_n_splits(wsp_ggml_backend_sched_t sched);
303
315
  WSP_GGML_API int wsp_ggml_backend_sched_get_n_copies(wsp_ggml_backend_sched_t sched);
304
316
 
305
- WSP_GGML_API size_t wsp_ggml_backend_sched_get_buffer_size(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend);
317
+ WSP_GGML_API wsp_ggml_backend_buffer_type_t wsp_ggml_backend_sched_get_buffer_type(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend);
318
+ WSP_GGML_API size_t wsp_ggml_backend_sched_get_buffer_size(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend);
306
319
 
307
320
  WSP_GGML_API void wsp_ggml_backend_sched_set_tensor_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node, wsp_ggml_backend_t backend);
308
321
  WSP_GGML_API wsp_ggml_backend_t wsp_ggml_backend_sched_get_tensor_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node);
309
322
 
323
+ // Split graph without allocating it
324
+ WSP_GGML_API void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph);
325
+
310
326
  // Allocate and compute graph on the backend scheduler
311
327
  WSP_GGML_API bool wsp_ggml_backend_sched_alloc_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph); // returns success
312
328
  WSP_GGML_API enum wsp_ggml_status wsp_ggml_backend_sched_graph_compute(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph);
@@ -99,6 +99,9 @@ typedef sycl::half2 wsp_ggml_half2;
99
99
  #define QI4_1 (QK4_1 / (4 * QR4_1))
100
100
  #define QR4_1 2
101
101
 
102
+ #define QI_MXFP4 (QK_MXFP4 / (4 * QR_MXFP4))
103
+ #define QR_MXFP4 2
104
+
102
105
  #define QI5_0 (QK5_0 / (4 * QR5_0))
103
106
  #define QR5_0 2
104
107
 
@@ -184,6 +187,13 @@ typedef struct {
184
187
  } block_q4_1;
185
188
  static_assert(sizeof(block_q4_1) == 2 * sizeof(wsp_ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
186
189
 
190
+ #define QK_MXFP4 32
191
+ typedef struct {
192
+ uint8_t e; // E8M0
193
+ uint8_t qs[QK_MXFP4/2];
194
+ } block_mxfp4;
195
+ static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + QK_MXFP4/2, "wrong mxfp4 block size/padding");
196
+
187
197
  #define QK5_0 32
188
198
  typedef struct {
189
199
  wsp_ggml_half d; // delta
@@ -1074,10 +1084,17 @@ WSP_GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
1074
1084
  0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
1075
1085
  WSP_GGML_TABLE_END()
1076
1086
 
1087
+ // TODO: fix name to kvalues_iq4_nl
1077
1088
  WSP_GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
1078
1089
  -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
1079
1090
  WSP_GGML_TABLE_END()
1080
1091
 
1092
+ // e2m1 values (doubled)
1093
+ // ref: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
1094
+ WSP_GGML_TABLE_BEGIN(int8_t, kvalues_mxfp4, 16)
1095
+ 0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12,
1096
+ WSP_GGML_TABLE_END()
1097
+
1081
1098
  #define NGRID_IQ1S 2048
1082
1099
  #define IQ1S_DELTA 0.125f
1083
1100
  #define IQ1M_DELTA 0.125f
@@ -101,7 +101,6 @@ extern "C" {
101
101
  WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_riscv_v (void);
102
102
  WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_vsx (void);
103
103
  WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_vxe (void);
104
- WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_nnpa (void);
105
104
  WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_wasm_simd (void);
106
105
  WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_llamafile (void);
107
106
 
@@ -135,6 +134,7 @@ extern "C" {
135
134
  WSP_GGML_BACKEND_API wsp_ggml_backend_reg_t wsp_ggml_backend_cpu_reg(void);
136
135
 
137
136
  WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
137
+ WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
138
138
  WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_fp16(const float *, wsp_ggml_fp16_t *, int64_t);
139
139
  WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp16_to_fp32(const wsp_ggml_fp16_t *, float *, int64_t);
140
140
  WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_bf16(const float *, wsp_ggml_bf16_t *, int64_t);