whisper.rn 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/build.gradle +2 -1
- package/android/gradle.properties +1 -1
- package/android/src/main/jni.cpp +12 -3
- package/cpp/ggml-alloc.c +292 -130
- package/cpp/ggml-backend-impl.h +4 -4
- package/cpp/ggml-backend-reg.cpp +13 -5
- package/cpp/ggml-backend.cpp +207 -17
- package/cpp/ggml-backend.h +19 -1
- package/cpp/ggml-cpu/amx/amx.cpp +5 -2
- package/cpp/ggml-cpu/arch/x86/repack.cpp +2 -2
- package/cpp/ggml-cpu/arch-fallback.h +0 -4
- package/cpp/ggml-cpu/common.h +14 -0
- package/cpp/ggml-cpu/ggml-cpu-impl.h +14 -7
- package/cpp/ggml-cpu/ggml-cpu.c +65 -44
- package/cpp/ggml-cpu/ggml-cpu.cpp +14 -4
- package/cpp/ggml-cpu/ops.cpp +542 -775
- package/cpp/ggml-cpu/ops.h +2 -0
- package/cpp/ggml-cpu/simd-mappings.h +88 -59
- package/cpp/ggml-cpu/unary-ops.cpp +135 -0
- package/cpp/ggml-cpu/unary-ops.h +5 -0
- package/cpp/ggml-cpu/vec.cpp +227 -20
- package/cpp/ggml-cpu/vec.h +407 -56
- package/cpp/ggml-cpu.h +1 -1
- package/cpp/ggml-impl.h +94 -12
- package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
- package/cpp/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/ggml-metal/ggml-metal-context.h +33 -0
- package/cpp/ggml-metal/ggml-metal-context.m +600 -0
- package/cpp/ggml-metal/ggml-metal-device.cpp +1565 -0
- package/cpp/ggml-metal/ggml-metal-device.h +244 -0
- package/cpp/ggml-metal/ggml-metal-device.m +1325 -0
- package/cpp/ggml-metal/ggml-metal-impl.h +802 -0
- package/cpp/ggml-metal/ggml-metal-ops.cpp +3583 -0
- package/cpp/ggml-metal/ggml-metal-ops.h +88 -0
- package/cpp/ggml-metal/ggml-metal.cpp +718 -0
- package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
- package/cpp/ggml-metal-impl.h +40 -40
- package/cpp/ggml-metal.h +1 -6
- package/cpp/ggml-quants.c +1 -0
- package/cpp/ggml.c +341 -15
- package/cpp/ggml.h +150 -5
- package/cpp/jsi/RNWhisperJSI.cpp +9 -2
- package/cpp/jsi/ThreadPool.h +3 -3
- package/cpp/rn-whisper.h +1 -0
- package/cpp/whisper.cpp +89 -72
- package/cpp/whisper.h +1 -0
- package/ios/CMakeLists.txt +6 -1
- package/ios/RNWhisperContext.mm +3 -1
- package/ios/RNWhisperVadContext.mm +14 -13
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +19 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +94 -12
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +150 -5
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +19 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +94 -12
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +150 -5
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +19 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +94 -12
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +150 -5
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +19 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +94 -12
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +150 -5
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/lib/commonjs/NativeRNWhisper.js.map +1 -1
- package/lib/commonjs/version.json +1 -1
- package/lib/module/NativeRNWhisper.js.map +1 -1
- package/lib/module/version.json +1 -1
- package/lib/typescript/NativeRNWhisper.d.ts +2 -0
- package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNWhisper.ts +2 -0
- package/src/version.json +1 -1
- package/whisper-rn.podspec +8 -9
- package/cpp/ggml-metal.m +0 -6779
- package/cpp/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-whisper.metallib +0 -0
package/cpp/ggml.h
CHANGED
|
@@ -237,6 +237,8 @@
|
|
|
237
237
|
#define WSP_GGML_EXIT_SUCCESS 0
|
|
238
238
|
#define WSP_GGML_EXIT_ABORTED 1
|
|
239
239
|
|
|
240
|
+
// TODO: convert to enum https://github.com/ggml-org/llama.cpp/pull/16187#discussion_r2388538726
|
|
241
|
+
#define WSP_GGML_ROPE_TYPE_NORMAL 0
|
|
240
242
|
#define WSP_GGML_ROPE_TYPE_NEOX 2
|
|
241
243
|
#define WSP_GGML_ROPE_TYPE_MROPE 8
|
|
242
244
|
#define WSP_GGML_ROPE_TYPE_VISION 24
|
|
@@ -244,6 +246,13 @@
|
|
|
244
246
|
#define WSP_GGML_MROPE_SECTIONS 4
|
|
245
247
|
|
|
246
248
|
#define WSP_GGML_UNUSED(x) (void)(x)
|
|
249
|
+
#ifdef __CUDACC__
|
|
250
|
+
template<typename... Args>
|
|
251
|
+
__host__ __device__ constexpr inline void wsp_ggml_unused_vars_impl(Args&&...) noexcept {}
|
|
252
|
+
#define WSP_GGML_UNUSED_VARS(...) wsp_ggml_unused_vars_impl(__VA_ARGS__)
|
|
253
|
+
#else
|
|
254
|
+
#define WSP_GGML_UNUSED_VARS(...) do { (void)sizeof((__VA_ARGS__, 0)); } while(0)
|
|
255
|
+
#endif // __CUDACC__
|
|
247
256
|
|
|
248
257
|
#define WSP_GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
|
|
249
258
|
|
|
@@ -277,19 +286,19 @@
|
|
|
277
286
|
// WSP_GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
|
|
278
287
|
//
|
|
279
288
|
#define WSP_GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
|
|
280
|
-
const type prefix##0 = (pointer)->array[0]; \
|
|
289
|
+
const type prefix##0 = (pointer) ? (pointer)->array[0] : 0; \
|
|
281
290
|
WSP_GGML_UNUSED(prefix##0);
|
|
282
291
|
#define WSP_GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
|
|
283
292
|
WSP_GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
|
|
284
|
-
const type prefix##1 = (pointer)->array[1]; \
|
|
293
|
+
const type prefix##1 = (pointer) ? (pointer)->array[1] : 0; \
|
|
285
294
|
WSP_GGML_UNUSED(prefix##1);
|
|
286
295
|
#define WSP_GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
|
|
287
296
|
WSP_GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
|
|
288
|
-
const type prefix##2 = (pointer)->array[2]; \
|
|
297
|
+
const type prefix##2 = (pointer) ? (pointer)->array[2] : 0; \
|
|
289
298
|
WSP_GGML_UNUSED(prefix##2);
|
|
290
299
|
#define WSP_GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
|
|
291
300
|
WSP_GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
|
|
292
|
-
const type prefix##3 = (pointer)->array[3]; \
|
|
301
|
+
const type prefix##3 = (pointer) ? (pointer)->array[3] : 0; \
|
|
293
302
|
WSP_GGML_UNUSED(prefix##3);
|
|
294
303
|
|
|
295
304
|
#define WSP_GGML_TENSOR_UNARY_OP_LOCALS \
|
|
@@ -504,7 +513,9 @@ extern "C" {
|
|
|
504
513
|
WSP_GGML_OP_CONV_TRANSPOSE_1D,
|
|
505
514
|
WSP_GGML_OP_IM2COL,
|
|
506
515
|
WSP_GGML_OP_IM2COL_BACK,
|
|
516
|
+
WSP_GGML_OP_IM2COL_3D,
|
|
507
517
|
WSP_GGML_OP_CONV_2D,
|
|
518
|
+
WSP_GGML_OP_CONV_3D,
|
|
508
519
|
WSP_GGML_OP_CONV_2D_DW,
|
|
509
520
|
WSP_GGML_OP_CONV_TRANSPOSE_2D,
|
|
510
521
|
WSP_GGML_OP_POOL_1D,
|
|
@@ -565,6 +576,11 @@ extern "C" {
|
|
|
565
576
|
WSP_GGML_UNARY_OP_HARDSIGMOID,
|
|
566
577
|
WSP_GGML_UNARY_OP_EXP,
|
|
567
578
|
WSP_GGML_UNARY_OP_GELU_ERF,
|
|
579
|
+
WSP_GGML_UNARY_OP_XIELU,
|
|
580
|
+
WSP_GGML_UNARY_OP_FLOOR,
|
|
581
|
+
WSP_GGML_UNARY_OP_CEIL,
|
|
582
|
+
WSP_GGML_UNARY_OP_ROUND,
|
|
583
|
+
WSP_GGML_UNARY_OP_TRUNC,
|
|
568
584
|
|
|
569
585
|
WSP_GGML_UNARY_OP_COUNT,
|
|
570
586
|
};
|
|
@@ -1139,6 +1155,58 @@ extern "C" {
|
|
|
1139
1155
|
struct wsp_ggml_context * ctx,
|
|
1140
1156
|
struct wsp_ggml_tensor * a);
|
|
1141
1157
|
|
|
1158
|
+
WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_floor(
|
|
1159
|
+
struct wsp_ggml_context * ctx,
|
|
1160
|
+
struct wsp_ggml_tensor * a);
|
|
1161
|
+
|
|
1162
|
+
WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_floor_inplace(
|
|
1163
|
+
struct wsp_ggml_context * ctx,
|
|
1164
|
+
struct wsp_ggml_tensor * a);
|
|
1165
|
+
|
|
1166
|
+
WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_ceil(
|
|
1167
|
+
struct wsp_ggml_context * ctx,
|
|
1168
|
+
struct wsp_ggml_tensor * a);
|
|
1169
|
+
|
|
1170
|
+
WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_ceil_inplace(
|
|
1171
|
+
struct wsp_ggml_context * ctx,
|
|
1172
|
+
struct wsp_ggml_tensor * a);
|
|
1173
|
+
|
|
1174
|
+
WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_round(
|
|
1175
|
+
struct wsp_ggml_context * ctx,
|
|
1176
|
+
struct wsp_ggml_tensor * a);
|
|
1177
|
+
|
|
1178
|
+
WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_round_inplace(
|
|
1179
|
+
struct wsp_ggml_context * ctx,
|
|
1180
|
+
struct wsp_ggml_tensor * a);
|
|
1181
|
+
|
|
1182
|
+
/**
|
|
1183
|
+
* Truncates the fractional part of each element in the tensor (towards zero).
|
|
1184
|
+
* For example: trunc(3.7) = 3.0, trunc(-2.9) = -2.0
|
|
1185
|
+
* Similar to std::trunc in C/C++.
|
|
1186
|
+
*/
|
|
1187
|
+
|
|
1188
|
+
WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_trunc(
|
|
1189
|
+
struct wsp_ggml_context * ctx,
|
|
1190
|
+
struct wsp_ggml_tensor * a);
|
|
1191
|
+
|
|
1192
|
+
WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_trunc_inplace(
|
|
1193
|
+
struct wsp_ggml_context * ctx,
|
|
1194
|
+
struct wsp_ggml_tensor * a);
|
|
1195
|
+
|
|
1196
|
+
|
|
1197
|
+
|
|
1198
|
+
// xIELU activation function
|
|
1199
|
+
// x = x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0)
|
|
1200
|
+
// where c_a = softplus and c_b(a, b) = softplus(a) + b are constraining functions
|
|
1201
|
+
// that constrain the positive and negative source alpha values respectively
|
|
1202
|
+
WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_xielu(
|
|
1203
|
+
struct wsp_ggml_context * ctx,
|
|
1204
|
+
struct wsp_ggml_tensor * a,
|
|
1205
|
+
float alpha_n,
|
|
1206
|
+
float alpha_p,
|
|
1207
|
+
float beta,
|
|
1208
|
+
float eps);
|
|
1209
|
+
|
|
1142
1210
|
// gated linear unit ops
|
|
1143
1211
|
// A: n columns, r rows,
|
|
1144
1212
|
// result is n / 2 columns, r rows,
|
|
@@ -1395,6 +1463,7 @@ extern "C" {
|
|
|
1395
1463
|
struct wsp_ggml_tensor * a,
|
|
1396
1464
|
struct wsp_ggml_tensor * b);
|
|
1397
1465
|
|
|
1466
|
+
// note: casting from f32 to i32 will discard the fractional part
|
|
1398
1467
|
WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cast(
|
|
1399
1468
|
struct wsp_ggml_context * ctx,
|
|
1400
1469
|
struct wsp_ggml_tensor * a,
|
|
@@ -1519,7 +1588,11 @@ extern "C" {
|
|
|
1519
1588
|
struct wsp_ggml_context * ctx,
|
|
1520
1589
|
struct wsp_ggml_tensor * a);
|
|
1521
1590
|
|
|
1522
|
-
// supports
|
|
1591
|
+
// supports 4D a:
|
|
1592
|
+
// a [n_embd, ne1, ne2, ne3]
|
|
1593
|
+
// b I32 [n_rows, ne2, ne3, 1]
|
|
1594
|
+
//
|
|
1595
|
+
// return [n_embd, n_rows, ne2, ne3]
|
|
1523
1596
|
WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_rows(
|
|
1524
1597
|
struct wsp_ggml_context * ctx,
|
|
1525
1598
|
struct wsp_ggml_tensor * a, // data
|
|
@@ -1601,6 +1674,13 @@ extern "C" {
|
|
|
1601
1674
|
float scale,
|
|
1602
1675
|
float max_bias);
|
|
1603
1676
|
|
|
1677
|
+
WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_soft_max_ext_inplace(
|
|
1678
|
+
struct wsp_ggml_context * ctx,
|
|
1679
|
+
struct wsp_ggml_tensor * a,
|
|
1680
|
+
struct wsp_ggml_tensor * mask,
|
|
1681
|
+
float scale,
|
|
1682
|
+
float max_bias);
|
|
1683
|
+
|
|
1604
1684
|
WSP_GGML_API void wsp_ggml_soft_max_add_sinks(
|
|
1605
1685
|
struct wsp_ggml_tensor * a,
|
|
1606
1686
|
struct wsp_ggml_tensor * sinks);
|
|
@@ -1862,6 +1942,41 @@ extern "C" {
|
|
|
1862
1942
|
int d0, // dilation dimension 0
|
|
1863
1943
|
int d1); // dilation dimension 1
|
|
1864
1944
|
|
|
1945
|
+
WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_im2col_3d(
|
|
1946
|
+
struct wsp_ggml_context * ctx,
|
|
1947
|
+
struct wsp_ggml_tensor * a,
|
|
1948
|
+
struct wsp_ggml_tensor * b,
|
|
1949
|
+
int64_t IC,
|
|
1950
|
+
int s0, // stride width
|
|
1951
|
+
int s1, // stride height
|
|
1952
|
+
int s2, // stride depth
|
|
1953
|
+
int p0, // padding width
|
|
1954
|
+
int p1, // padding height
|
|
1955
|
+
int p2, // padding depth
|
|
1956
|
+
int d0, // dilation width
|
|
1957
|
+
int d1, // dilation height
|
|
1958
|
+
int d2, // dilation depth
|
|
1959
|
+
enum wsp_ggml_type dst_type);
|
|
1960
|
+
|
|
1961
|
+
// a: [OC*IC, KD, KH, KW]
|
|
1962
|
+
// b: [N*IC, ID, IH, IW]
|
|
1963
|
+
// result: [N*OC, OD, OH, OW]
|
|
1964
|
+
WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_3d(
|
|
1965
|
+
struct wsp_ggml_context * ctx,
|
|
1966
|
+
struct wsp_ggml_tensor * a,
|
|
1967
|
+
struct wsp_ggml_tensor * b,
|
|
1968
|
+
int64_t IC,
|
|
1969
|
+
int s0, // stride width
|
|
1970
|
+
int s1, // stride height
|
|
1971
|
+
int s2, // stride depth
|
|
1972
|
+
int p0, // padding width
|
|
1973
|
+
int p1, // padding height
|
|
1974
|
+
int p2, // padding depth
|
|
1975
|
+
int d0, // dilation width
|
|
1976
|
+
int d1, // dilation height
|
|
1977
|
+
int d2 // dilation depth
|
|
1978
|
+
);
|
|
1979
|
+
|
|
1865
1980
|
// kernel size is a->ne[0] x a->ne[1]
|
|
1866
1981
|
// stride is equal to kernel size
|
|
1867
1982
|
// padding is zero
|
|
@@ -1933,6 +2048,23 @@ extern "C" {
|
|
|
1933
2048
|
int d0, // dilation dimension 0
|
|
1934
2049
|
int d1); // dilation dimension 1
|
|
1935
2050
|
|
|
2051
|
+
WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_3d_direct(
|
|
2052
|
+
struct wsp_ggml_context * ctx,
|
|
2053
|
+
struct wsp_ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
|
|
2054
|
+
struct wsp_ggml_tensor * b, // input [W, H, D, C * N]
|
|
2055
|
+
int s0, // stride
|
|
2056
|
+
int s1,
|
|
2057
|
+
int s2,
|
|
2058
|
+
int p0, // padding
|
|
2059
|
+
int p1,
|
|
2060
|
+
int p2,
|
|
2061
|
+
int d0, // dilation
|
|
2062
|
+
int d1,
|
|
2063
|
+
int d2,
|
|
2064
|
+
int n_channels,
|
|
2065
|
+
int n_batch,
|
|
2066
|
+
int n_channels_out);
|
|
2067
|
+
|
|
1936
2068
|
enum wsp_ggml_op_pool {
|
|
1937
2069
|
WSP_GGML_OP_POOL_MAX,
|
|
1938
2070
|
WSP_GGML_OP_POOL_AVG,
|
|
@@ -2023,6 +2155,19 @@ extern "C" {
|
|
|
2023
2155
|
int p2,
|
|
2024
2156
|
int p3);
|
|
2025
2157
|
|
|
2158
|
+
WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_pad_ext(
|
|
2159
|
+
struct wsp_ggml_context * ctx,
|
|
2160
|
+
struct wsp_ggml_tensor * a,
|
|
2161
|
+
int lp0,
|
|
2162
|
+
int rp0,
|
|
2163
|
+
int lp1,
|
|
2164
|
+
int rp1,
|
|
2165
|
+
int lp2,
|
|
2166
|
+
int rp2,
|
|
2167
|
+
int lp3,
|
|
2168
|
+
int rp3
|
|
2169
|
+
);
|
|
2170
|
+
|
|
2026
2171
|
// pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
|
|
2027
2172
|
WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_pad_reflect_1d(
|
|
2028
2173
|
struct wsp_ggml_context * ctx,
|
package/cpp/jsi/RNWhisperJSI.cpp
CHANGED
|
@@ -17,6 +17,8 @@ using namespace facebook::jsi;
|
|
|
17
17
|
|
|
18
18
|
namespace rnwhisper_jsi {
|
|
19
19
|
|
|
20
|
+
using namespace facebook::jsi;
|
|
21
|
+
|
|
20
22
|
// Consolidated logging function
|
|
21
23
|
enum class LogLevel { LOG_DEBUG, LOG_INFO, LOG_ERROR };
|
|
22
24
|
|
|
@@ -267,11 +269,13 @@ struct CallbackInfo {
|
|
|
267
269
|
std::shared_ptr<Function> onProgressCallback;
|
|
268
270
|
std::shared_ptr<Function> onNewSegmentsCallback;
|
|
269
271
|
int jobId;
|
|
272
|
+
int nProcessors;
|
|
270
273
|
};
|
|
271
274
|
|
|
272
275
|
CallbackInfo extractCallbacks(Runtime& runtime, const Object& optionsObj) {
|
|
273
276
|
CallbackInfo info;
|
|
274
277
|
info.jobId = rand(); // Default fallback jobId
|
|
278
|
+
info.nProcessors = 1; // Default to 1 processor
|
|
275
279
|
|
|
276
280
|
try {
|
|
277
281
|
auto propNames = optionsObj.getPropertyNames(runtime);
|
|
@@ -286,6 +290,8 @@ CallbackInfo extractCallbacks(Runtime& runtime, const Object& optionsObj) {
|
|
|
286
290
|
info.onNewSegmentsCallback = std::make_shared<Function>(propValue.getObject(runtime).getFunction(runtime));
|
|
287
291
|
} else if (propName == "jobId" && propValue.isNumber()) {
|
|
288
292
|
info.jobId = (int)propValue.getNumber();
|
|
293
|
+
} else if (propName == "nProcessors" && propValue.isNumber()) {
|
|
294
|
+
info.nProcessors = (int)propValue.getNumber();
|
|
289
295
|
}
|
|
290
296
|
}
|
|
291
297
|
} catch (...) {
|
|
@@ -549,12 +555,13 @@ void installJSIBindings(
|
|
|
549
555
|
code = -2;
|
|
550
556
|
} else {
|
|
551
557
|
try {
|
|
552
|
-
|
|
558
|
+
job->n_processors = callbackInfo.nProcessors;
|
|
559
|
+
code = whisper_full_parallel(context, job->params, audioResult.data.data(), audioResult.count, job->n_processors);
|
|
553
560
|
if (job->is_aborted()) {
|
|
554
561
|
code = -999;
|
|
555
562
|
}
|
|
556
563
|
} catch (...) {
|
|
557
|
-
logError("Exception during
|
|
564
|
+
logError("Exception during whisper_full_parallel transcription");
|
|
558
565
|
code = -3;
|
|
559
566
|
}
|
|
560
567
|
rnwhisper::job_remove(callbackInfo.jobId);
|
package/cpp/jsi/ThreadPool.h
CHANGED
|
@@ -18,7 +18,7 @@ public:
|
|
|
18
18
|
ThreadPool(size_t);
|
|
19
19
|
template<class F, class... Args>
|
|
20
20
|
auto enqueue(F&& f, Args&&... args)
|
|
21
|
-
-> std::future<
|
|
21
|
+
-> std::future<std::invoke_result_t<F, Args...>>;
|
|
22
22
|
~ThreadPool();
|
|
23
23
|
private:
|
|
24
24
|
// need to keep track of threads so we can join them
|
|
@@ -63,9 +63,9 @@ inline ThreadPool::ThreadPool(size_t threads)
|
|
|
63
63
|
// add new work item to the pool
|
|
64
64
|
template<class F, class... Args>
|
|
65
65
|
auto ThreadPool::enqueue(F&& f, Args&&... args)
|
|
66
|
-
-> std::future<
|
|
66
|
+
-> std::future<std::invoke_result_t<F, Args...>>
|
|
67
67
|
{
|
|
68
|
-
using return_type =
|
|
68
|
+
using return_type = std::invoke_result_t<F, Args...>;
|
|
69
69
|
|
|
70
70
|
auto task = std::make_shared< std::packaged_task<return_type()> >(
|
|
71
71
|
std::bind(std::forward<F>(f), std::forward<Args>(args)...)
|
package/cpp/rn-whisper.h
CHANGED
package/cpp/whisper.cpp
CHANGED
|
@@ -21,14 +21,12 @@
|
|
|
21
21
|
#define _USE_MATH_DEFINES
|
|
22
22
|
#include <cmath>
|
|
23
23
|
#include <climits>
|
|
24
|
-
#include <codecvt>
|
|
25
24
|
#include <cstdarg>
|
|
26
25
|
#include <cstdio>
|
|
27
26
|
#include <cstring>
|
|
28
27
|
#include <fstream>
|
|
29
28
|
#include <functional>
|
|
30
29
|
#include <map>
|
|
31
|
-
#include <mutex>
|
|
32
30
|
#include <random>
|
|
33
31
|
#include <regex>
|
|
34
32
|
#include <set>
|
|
@@ -36,6 +34,10 @@
|
|
|
36
34
|
#include <thread>
|
|
37
35
|
#include <vector>
|
|
38
36
|
|
|
37
|
+
#ifdef _MSC_VER
|
|
38
|
+
#include <codecvt>
|
|
39
|
+
#endif
|
|
40
|
+
|
|
39
41
|
#if defined(WHISPER_BIG_ENDIAN)
|
|
40
42
|
template<typename T>
|
|
41
43
|
static T byteswap(T value) {
|
|
@@ -138,6 +140,10 @@ static void whisper_log_callback_default(wsp_ggml_log_level level, const char *
|
|
|
138
140
|
} while (0)
|
|
139
141
|
|
|
140
142
|
#define WHISPER_MAX_DECODERS 8
|
|
143
|
+
|
|
144
|
+
// temperature below which we condition on past text history
|
|
145
|
+
static constexpr float WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF = 0.5f;
|
|
146
|
+
|
|
141
147
|
#define WHISPER_MAX_NODES 4096
|
|
142
148
|
|
|
143
149
|
static std::string format(const char * fmt, ...) {
|
|
@@ -252,45 +258,6 @@ static void whisper_set_i32_nd(struct wsp_ggml_tensor * t, int64_t i0, int64_t i
|
|
|
252
258
|
*(int32_t *) data = v;
|
|
253
259
|
}
|
|
254
260
|
|
|
255
|
-
// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
|
|
256
|
-
// the idea is to represent the original matrix multiplication:
|
|
257
|
-
//
|
|
258
|
-
// Z = X @ Y
|
|
259
|
-
//
|
|
260
|
-
// with the sum of two matrix multiplications:
|
|
261
|
-
//
|
|
262
|
-
// Z = (X_0 @ Y_0) + (X_1 @ Y_1)
|
|
263
|
-
//
|
|
264
|
-
// here X_0 and Y_0 are views of X and Y that have dimension 0 divisible by "pad"
|
|
265
|
-
// and X_1 and Y_1 are the remaining views. X_1 and Y_1 end up being small matrices that can be processed with more
|
|
266
|
-
// general-purpose kernels
|
|
267
|
-
//
|
|
268
|
-
static struct wsp_ggml_tensor * wsp_ggml_mul_mat_pad(struct wsp_ggml_context * ctx, struct wsp_ggml_tensor * x, struct wsp_ggml_tensor * y, int pad = 32) {
|
|
269
|
-
// use padding only if dimension 0 is at least 8 times larger than the padding
|
|
270
|
-
// else we won't get much benefit from the optimization
|
|
271
|
-
const int n_pad_req = 8;
|
|
272
|
-
|
|
273
|
-
if (x->ne[0] % pad == 0 || x->ne[0] / pad < n_pad_req) {
|
|
274
|
-
return wsp_ggml_mul_mat(ctx, x, y);
|
|
275
|
-
}
|
|
276
|
-
|
|
277
|
-
struct wsp_ggml_tensor * x_0 = wsp_ggml_view_3d(ctx, x, (x->ne[0]/pad)*pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], 0);
|
|
278
|
-
struct wsp_ggml_tensor * x_1 = wsp_ggml_view_3d(ctx, x, x->ne[0]%pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], x_0->ne[0]*x_0->nb[0]);
|
|
279
|
-
|
|
280
|
-
struct wsp_ggml_tensor * y_0 = wsp_ggml_view_3d(ctx, y, (y->ne[0]/pad)*pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], 0);
|
|
281
|
-
struct wsp_ggml_tensor * y_1 = wsp_ggml_view_3d(ctx, y, y->ne[0]%pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], y_0->ne[0]*y_0->nb[0]);
|
|
282
|
-
|
|
283
|
-
return wsp_ggml_add(ctx,
|
|
284
|
-
wsp_ggml_mul_mat(ctx, x_0, y_0),
|
|
285
|
-
wsp_ggml_mul_mat(ctx, x_1, y_1));
|
|
286
|
-
}
|
|
287
|
-
|
|
288
|
-
// TODO: check if other platforms can benefit from this optimization
|
|
289
|
-
// TODO: CUDA is currently broken - seems wsp_ggml_mul_mat does not handle views correctly
|
|
290
|
-
#if defined(WSP_GGML_USE_METAL)
|
|
291
|
-
#define wsp_ggml_mul_mat wsp_ggml_mul_mat_pad
|
|
292
|
-
#endif
|
|
293
|
-
|
|
294
261
|
// available whisper models
|
|
295
262
|
enum e_model {
|
|
296
263
|
MODEL_UNKNOWN,
|
|
@@ -919,7 +886,10 @@ struct whisper_state {
|
|
|
919
886
|
std::vector<float> logits;
|
|
920
887
|
|
|
921
888
|
std::vector<whisper_segment> result_all;
|
|
922
|
-
|
|
889
|
+
|
|
890
|
+
// prompt history split into static prefix (prompt_past0) and dynamic rolling context (prompt_past1)
|
|
891
|
+
std::vector<whisper_token> prompt_past0; // static carried initial prompt (if enabled)
|
|
892
|
+
std::vector<whisper_token> prompt_past1; // dynamic context from decoded output
|
|
923
893
|
|
|
924
894
|
int lang_id = 0; // english by default
|
|
925
895
|
|
|
@@ -1326,7 +1296,7 @@ static wsp_ggml_backend_t whisper_backend_init_gpu(const whisper_context_params
|
|
|
1326
1296
|
if (params.use_gpu) {
|
|
1327
1297
|
for (size_t i = 0; i < wsp_ggml_backend_dev_count(); ++i) {
|
|
1328
1298
|
wsp_ggml_backend_dev_t dev_cur = wsp_ggml_backend_dev_get(i);
|
|
1329
|
-
if (wsp_ggml_backend_dev_type(dev_cur) == WSP_GGML_BACKEND_DEVICE_TYPE_GPU) {
|
|
1299
|
+
if (wsp_ggml_backend_dev_type(dev_cur) == WSP_GGML_BACKEND_DEVICE_TYPE_GPU || wsp_ggml_backend_dev_type(dev_cur) == WSP_GGML_BACKEND_DEVICE_TYPE_IGPU) {
|
|
1330
1300
|
if (cnt == params.gpu_device) {
|
|
1331
1301
|
dev = dev_cur;
|
|
1332
1302
|
}
|
|
@@ -1395,7 +1365,7 @@ static buft_list_t make_buft_list(whisper_context_params & params) {
|
|
|
1395
1365
|
int cnt = 0;
|
|
1396
1366
|
for (size_t i = 0; i < wsp_ggml_backend_dev_count(); ++i) {
|
|
1397
1367
|
wsp_ggml_backend_dev_t dev = wsp_ggml_backend_dev_get(i);
|
|
1398
|
-
if (wsp_ggml_backend_dev_type(dev) == WSP_GGML_BACKEND_DEVICE_TYPE_GPU) {
|
|
1368
|
+
if (wsp_ggml_backend_dev_type(dev) == WSP_GGML_BACKEND_DEVICE_TYPE_GPU || wsp_ggml_backend_dev_type(dev) == WSP_GGML_BACKEND_DEVICE_TYPE_IGPU) {
|
|
1399
1369
|
if (cnt == params.gpu_device) {
|
|
1400
1370
|
auto * buft = wsp_ggml_backend_dev_buffer_type(dev);
|
|
1401
1371
|
if (buft) {
|
|
@@ -1433,6 +1403,7 @@ static bool weight_buft_supported(const whisper_hparams & hparams, wsp_ggml_tens
|
|
|
1433
1403
|
bool op_supported = true;
|
|
1434
1404
|
|
|
1435
1405
|
if (wsp_ggml_backend_dev_type(dev) == WSP_GGML_BACKEND_DEVICE_TYPE_GPU ||
|
|
1406
|
+
wsp_ggml_backend_dev_type(dev) == WSP_GGML_BACKEND_DEVICE_TYPE_IGPU ||
|
|
1436
1407
|
(wsp_ggml_backend_dev_type(dev) == WSP_GGML_BACKEND_DEVICE_TYPE_CPU && buft == wsp_ggml_backend_cpu_buffer_type())) {
|
|
1437
1408
|
// GPU and default CPU backend support all operators
|
|
1438
1409
|
op_supported = true;
|
|
@@ -3635,7 +3606,7 @@ struct whisper_context_params whisper_context_default_params() {
|
|
|
3635
3606
|
struct whisper_context_params result = {
|
|
3636
3607
|
/*.use_gpu =*/ true,
|
|
3637
3608
|
/*.use_coreml =*/ false,
|
|
3638
|
-
/*.flash_attn =*/
|
|
3609
|
+
/*.flash_attn =*/ true,
|
|
3639
3610
|
/*.gpu_device =*/ 0,
|
|
3640
3611
|
|
|
3641
3612
|
/*.dtw_token_timestamps =*/ false,
|
|
@@ -4489,6 +4460,7 @@ static bool weight_buft_supported(const whisper_vad_hparams & hparams, wsp_ggml_
|
|
|
4489
4460
|
bool op_supported = true;
|
|
4490
4461
|
|
|
4491
4462
|
if (wsp_ggml_backend_dev_type(dev) == WSP_GGML_BACKEND_DEVICE_TYPE_GPU ||
|
|
4463
|
+
wsp_ggml_backend_dev_type(dev) == WSP_GGML_BACKEND_DEVICE_TYPE_IGPU ||
|
|
4492
4464
|
(wsp_ggml_backend_dev_type(dev) == WSP_GGML_BACKEND_DEVICE_TYPE_CPU && buft == wsp_ggml_backend_cpu_buffer_type())) {
|
|
4493
4465
|
// GPU and default CPU backend support all operators
|
|
4494
4466
|
op_supported = true;
|
|
@@ -4719,6 +4691,7 @@ static bool whisper_vad_init_context(whisper_vad_context * vctx) {
|
|
|
4719
4691
|
wsp_ggml_set_name(vctx->c_state, "c_state");
|
|
4720
4692
|
|
|
4721
4693
|
vctx->buffer = wsp_ggml_backend_alloc_ctx_tensors(ctx, vctx->backends[0]);
|
|
4694
|
+
wsp_ggml_free(ctx);
|
|
4722
4695
|
if (!vctx->buffer) {
|
|
4723
4696
|
WHISPER_LOG_ERROR("%s: failed to allocate memory for the VAD state\n", __func__);
|
|
4724
4697
|
return false;
|
|
@@ -5463,6 +5436,9 @@ struct whisper_vad_segments * whisper_vad_segments_from_samples(
|
|
|
5463
5436
|
|
|
5464
5437
|
void whisper_vad_free(whisper_vad_context * ctx) {
|
|
5465
5438
|
if (ctx) {
|
|
5439
|
+
if (ctx->buffer) {
|
|
5440
|
+
wsp_ggml_backend_buffer_free(ctx->buffer);
|
|
5441
|
+
}
|
|
5466
5442
|
for (wsp_ggml_context * context : ctx->model.ctxs) {
|
|
5467
5443
|
wsp_ggml_free(context);
|
|
5468
5444
|
}
|
|
@@ -5477,6 +5453,9 @@ void whisper_vad_free(whisper_vad_context * ctx) {
|
|
|
5477
5453
|
wsp_ggml_backend_free(backend);
|
|
5478
5454
|
}
|
|
5479
5455
|
|
|
5456
|
+
delete[] ctx->model.hparams.encoder_in_channels;
|
|
5457
|
+
delete[] ctx->model.hparams.encoder_out_channels;
|
|
5458
|
+
delete[] ctx->model.hparams.kernel_sizes;
|
|
5480
5459
|
|
|
5481
5460
|
delete ctx;
|
|
5482
5461
|
}
|
|
@@ -5956,9 +5935,10 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
|
|
5956
5935
|
|
|
5957
5936
|
/* suppress_regex =*/ nullptr,
|
|
5958
5937
|
|
|
5959
|
-
/*.initial_prompt
|
|
5960
|
-
/*.
|
|
5961
|
-
/*.
|
|
5938
|
+
/*.initial_prompt =*/ nullptr,
|
|
5939
|
+
/*.carry_initial_prompt =*/ false,
|
|
5940
|
+
/*.prompt_tokens =*/ nullptr,
|
|
5941
|
+
/*.prompt_n_tokens =*/ 0,
|
|
5962
5942
|
|
|
5963
5943
|
/*.language =*/ "en",
|
|
5964
5944
|
/*.detect_language =*/ false,
|
|
@@ -6654,6 +6634,10 @@ static bool whisper_vad(
|
|
|
6654
6634
|
|
|
6655
6635
|
whisper_vad_segments * vad_segments = whisper_vad_segments_from_samples(vctx, vad_params, samples, n_samples);
|
|
6656
6636
|
|
|
6637
|
+
if (!vad_segments) {
|
|
6638
|
+
return false;
|
|
6639
|
+
}
|
|
6640
|
+
|
|
6657
6641
|
if (vad_segments->data.size() > 0) {
|
|
6658
6642
|
state->has_vad_segments = true;
|
|
6659
6643
|
ctx->state->vad_segments.clear();
|
|
@@ -6696,7 +6680,6 @@ static bool whisper_vad(
|
|
|
6696
6680
|
} catch (const std::bad_alloc & /* e */) {
|
|
6697
6681
|
WHISPER_LOG_ERROR("%s: failed to allocate memory for filtered samples\n", __func__);
|
|
6698
6682
|
whisper_vad_free_segments(vad_segments);
|
|
6699
|
-
whisper_vad_free(vctx);
|
|
6700
6683
|
return false;
|
|
6701
6684
|
}
|
|
6702
6685
|
|
|
@@ -6802,6 +6785,7 @@ static bool whisper_vad(
|
|
|
6802
6785
|
__func__, n_samples, filtered_n_samples, 100.0f * (1.0f - (float)filtered_n_samples / n_samples));
|
|
6803
6786
|
}
|
|
6804
6787
|
|
|
6788
|
+
whisper_vad_free_segments(vad_segments);
|
|
6805
6789
|
return true;
|
|
6806
6790
|
}
|
|
6807
6791
|
|
|
@@ -6910,17 +6894,22 @@ int whisper_full_with_state(
|
|
|
6910
6894
|
decoder.rng = std::mt19937(j);
|
|
6911
6895
|
}
|
|
6912
6896
|
|
|
6913
|
-
// the accumulated text context
|
|
6914
|
-
auto &
|
|
6897
|
+
// the accumulated text context split into static (prompt_past0) and dynamic (prompt_past1)
|
|
6898
|
+
auto & prompt_past0 = state->prompt_past0;
|
|
6899
|
+
auto & prompt_past1 = state->prompt_past1;
|
|
6915
6900
|
if (params.no_context) {
|
|
6916
|
-
|
|
6901
|
+
prompt_past0.clear();
|
|
6902
|
+
prompt_past1.clear();
|
|
6917
6903
|
}
|
|
6918
6904
|
|
|
6905
|
+
// calculate the maximum context budget for prompt history
|
|
6906
|
+
const int max_prompt_ctx = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);
|
|
6907
|
+
|
|
6919
6908
|
// prepare prompt
|
|
6920
6909
|
{
|
|
6921
6910
|
std::vector<whisper_token> prompt_tokens;
|
|
6922
6911
|
|
|
6923
|
-
// initial prompt
|
|
6912
|
+
// tokenize the initial prompt
|
|
6924
6913
|
if (!params.prompt_tokens && params.initial_prompt) {
|
|
6925
6914
|
prompt_tokens.resize(1024);
|
|
6926
6915
|
int n_needed = whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size());
|
|
@@ -6932,14 +6921,25 @@ int whisper_full_with_state(
|
|
|
6932
6921
|
params.prompt_tokens = prompt_tokens.data();
|
|
6933
6922
|
params.prompt_n_tokens = prompt_tokens.size();
|
|
6934
6923
|
}
|
|
6935
|
-
|
|
6936
|
-
// prepend the prompt tokens to the prompt_past
|
|
6937
6924
|
if (params.prompt_tokens && params.prompt_n_tokens > 0) {
|
|
6938
|
-
|
|
6939
|
-
|
|
6940
|
-
|
|
6925
|
+
if (params.carry_initial_prompt) {
|
|
6926
|
+
if (prompt_past0.empty()) {
|
|
6927
|
+
const int max_tokens = std::max(1, max_prompt_ctx - 1);
|
|
6928
|
+
|
|
6929
|
+
if (params.prompt_n_tokens > max_tokens) {
|
|
6930
|
+
WHISPER_LOG_WARN("%s: initial prompt is too long (%d tokens), will use only the last %d tokens\n",
|
|
6931
|
+
__func__, params.prompt_n_tokens, max_tokens);
|
|
6932
|
+
}
|
|
6933
|
+
|
|
6934
|
+
const int n_tokens = std::min(params.prompt_n_tokens, max_tokens);
|
|
6935
|
+
prompt_past0.assign(params.prompt_tokens + (params.prompt_n_tokens - n_tokens), params.prompt_tokens + params.prompt_n_tokens);
|
|
6936
|
+
}
|
|
6937
|
+
} else {
|
|
6938
|
+
for (int i = 0; i < params.prompt_n_tokens; ++i) {
|
|
6939
|
+
prompt_past1.push_back(params.prompt_tokens[i]);
|
|
6940
|
+
}
|
|
6941
|
+
std::rotate(prompt_past1.begin(), prompt_past1.end() - params.prompt_n_tokens, prompt_past1.end());
|
|
6941
6942
|
}
|
|
6942
|
-
std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end());
|
|
6943
6943
|
}
|
|
6944
6944
|
}
|
|
6945
6945
|
|
|
@@ -7025,7 +7025,8 @@ int whisper_full_with_state(
|
|
|
7025
7025
|
// if there is a very short audio segment left to process, we remove any past prompt since it tends
|
|
7026
7026
|
// to confuse the decoder and often make it repeat or hallucinate stuff
|
|
7027
7027
|
if (seek > seek_start && seek + 500 >= seek_end) {
|
|
7028
|
-
|
|
7028
|
+
prompt_past0.clear();
|
|
7029
|
+
prompt_past1.clear();
|
|
7029
7030
|
}
|
|
7030
7031
|
|
|
7031
7032
|
int best_decoder_id = 0;
|
|
@@ -7086,12 +7087,25 @@ int whisper_full_with_state(
|
|
|
7086
7087
|
{
|
|
7087
7088
|
prompt.clear();
|
|
7088
7089
|
|
|
7089
|
-
|
|
7090
|
-
|
|
7091
|
-
|
|
7090
|
+
if (params.n_max_text_ctx > 0 && t_cur < WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF) {
|
|
7091
|
+
const bool can_take0 = params.carry_initial_prompt && !prompt_past0.empty();
|
|
7092
|
+
const bool can_take1 = !prompt_past1.empty();
|
|
7092
7093
|
|
|
7093
|
-
|
|
7094
|
-
|
|
7094
|
+
if (max_prompt_ctx > 0 && (can_take0 || can_take1)) {
|
|
7095
|
+
// Always start with previous token marker to connect continuity
|
|
7096
|
+
prompt.push_back(whisper_token_prev(ctx));
|
|
7097
|
+
|
|
7098
|
+
// Take static tokens (initial prompt) first
|
|
7099
|
+
int n_take0 = 0;
|
|
7100
|
+
if (can_take0) {
|
|
7101
|
+
n_take0 = prompt_past0.size();
|
|
7102
|
+
prompt.insert(prompt.end(), prompt_past0.end() - n_take0, prompt_past0.end());
|
|
7103
|
+
}
|
|
7104
|
+
|
|
7105
|
+
// Fill remaining budget with dynamic tokens (rolling context)
|
|
7106
|
+
const int n_take1 = std::min<int>(max_prompt_ctx - n_take0 - 1, prompt_past1.size());
|
|
7107
|
+
prompt.insert(prompt.end(), prompt_past1.end() - n_take1, prompt_past1.end());
|
|
7108
|
+
}
|
|
7095
7109
|
}
|
|
7096
7110
|
|
|
7097
7111
|
// init new transcription with sot, language (opt) and task tokens
|
|
@@ -7573,14 +7587,17 @@ int whisper_full_with_state(
|
|
|
7573
7587
|
|
|
7574
7588
|
//WHISPER_LOG_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta);
|
|
7575
7589
|
|
|
7576
|
-
// update
|
|
7577
|
-
|
|
7578
|
-
if (prompt.front() == whisper_token_prev(ctx)) {
|
|
7579
|
-
|
|
7590
|
+
// update prompt_past1
|
|
7591
|
+
prompt_past1.clear();
|
|
7592
|
+
if (!params.carry_initial_prompt && !prompt.empty() && prompt.front() == whisper_token_prev(ctx)) {
|
|
7593
|
+
prompt_past1.insert(prompt_past1.end(), prompt.begin() + 1, prompt.end() - prompt_init.size());
|
|
7580
7594
|
}
|
|
7581
7595
|
|
|
7582
|
-
|
|
7583
|
-
|
|
7596
|
+
// Add newly decoded tokens to the rolling context
|
|
7597
|
+
if (!is_no_speech) {
|
|
7598
|
+
for (int i = 0; i < result_len; ++i) {
|
|
7599
|
+
prompt_past1.push_back(tokens_cur[i].id);
|
|
7600
|
+
}
|
|
7584
7601
|
}
|
|
7585
7602
|
|
|
7586
7603
|
if (!tokens_cur.empty() && ctx->model.n_loaded > 0 && !is_no_speech) {
|
|
@@ -8952,7 +8969,7 @@ void whisper_log_set(wsp_ggml_log_callback log_callback, void * user_data) {
|
|
|
8952
8969
|
}
|
|
8953
8970
|
|
|
8954
8971
|
const char * whisper_version(void) {
|
|
8955
|
-
return "1.
|
|
8972
|
+
return "1.8.0";
|
|
8956
8973
|
}
|
|
8957
8974
|
|
|
8958
8975
|
WSP_GGML_ATTRIBUTE_FORMAT(2, 3)
|
package/cpp/whisper.h
CHANGED
|
@@ -526,6 +526,7 @@ extern "C" {
|
|
|
526
526
|
// use whisper_tokenize() to convert text to tokens
|
|
527
527
|
// maximum of whisper_n_text_ctx()/2 tokens are used (typically 224)
|
|
528
528
|
const char * initial_prompt;
|
|
529
|
+
bool carry_initial_prompt; // if true, always prepend initial_prompt to every decode window (may reduce conditioning on previous text)
|
|
529
530
|
const whisper_token * prompt_tokens;
|
|
530
531
|
int prompt_n_tokens;
|
|
531
532
|
|