whisper.rn 0.4.0-rc.10 → 0.4.0-rc.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +9 -3
- package/cpp/ggml-alloc.c +6 -14
- package/cpp/ggml-backend-impl.h +50 -11
- package/cpp/ggml-backend-reg.cpp +409 -31
- package/cpp/ggml-backend.cpp +9 -3
- package/cpp/ggml-backend.h +18 -0
- package/cpp/ggml-common.h +41 -43
- package/cpp/ggml-cpp.h +1 -0
- package/cpp/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +941 -254
- package/cpp/ggml-cpu-aarch64.h +2 -24
- package/cpp/ggml-cpu-impl.h +171 -11
- package/cpp/ggml-cpu-quants.c +1812 -389
- package/cpp/ggml-cpu-traits.cpp +36 -0
- package/cpp/ggml-cpu-traits.h +38 -0
- package/cpp/ggml-cpu.c +1432 -610
- package/cpp/ggml-cpu.cpp +131 -141
- package/cpp/ggml-cpu.h +10 -50
- package/cpp/ggml-impl.h +27 -11
- package/cpp/ggml-metal-impl.h +39 -0
- package/cpp/ggml-metal.h +1 -1
- package/cpp/ggml-metal.m +1031 -359
- package/cpp/ggml-opt.cpp +854 -0
- package/cpp/ggml-opt.h +216 -0
- package/cpp/ggml-quants.c +0 -9
- package/cpp/ggml-threading.h +4 -2
- package/cpp/ggml-whisper.metallib +0 -0
- package/cpp/ggml.c +501 -1537
- package/cpp/ggml.h +144 -171
- package/cpp/gguf.cpp +1329 -0
- package/cpp/gguf.h +202 -0
- package/cpp/whisper.cpp +254 -114
- package/cpp/whisper.h +6 -3
- package/lib/commonjs/version.json +1 -1
- package/lib/module/version.json +1 -1
- package/package.json +1 -1
- package/src/version.json +1 -1
- package/whisper-rn.podspec +2 -2
- package/cpp/README.md +0 -4
- package/cpp/ggml-aarch64.c +0 -129
- package/cpp/ggml-aarch64.h +0 -19
- package/cpp/ggml-backend.cpp.rej +0 -12
package/cpp/whisper.cpp
CHANGED
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
#include "whisper.h"
|
|
2
2
|
|
|
3
|
-
#include "ggml-cpu.h"
|
|
4
|
-
|
|
5
3
|
#include "ggml.h"
|
|
4
|
+
#include "ggml-cpp.h"
|
|
6
5
|
#include "ggml-alloc.h"
|
|
7
6
|
#include "ggml-backend.h"
|
|
8
7
|
|
|
@@ -19,35 +18,38 @@
|
|
|
19
18
|
#include <cassert>
|
|
20
19
|
#define _USE_MATH_DEFINES
|
|
21
20
|
#include <cmath>
|
|
22
|
-
#include <
|
|
21
|
+
#include <codecvt>
|
|
23
22
|
#include <cstdarg>
|
|
23
|
+
#include <cstdio>
|
|
24
24
|
#include <cstring>
|
|
25
25
|
#include <fstream>
|
|
26
|
+
#include <functional>
|
|
26
27
|
#include <map>
|
|
28
|
+
#include <mutex>
|
|
29
|
+
#include <random>
|
|
30
|
+
#include <regex>
|
|
27
31
|
#include <set>
|
|
28
32
|
#include <string>
|
|
29
33
|
#include <thread>
|
|
30
34
|
#include <vector>
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
#include <functional>
|
|
34
|
-
#include <codecvt>
|
|
35
|
+
|
|
36
|
+
// dummy
|
|
35
37
|
|
|
36
38
|
#if defined(_MSC_VER)
|
|
37
39
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
|
38
40
|
#endif
|
|
39
41
|
|
|
40
|
-
#if defined(
|
|
41
|
-
#include <bit>
|
|
42
|
-
|
|
42
|
+
#if defined(WHISPER_BIG_ENDIAN)
|
|
43
43
|
// Return a copy of `value` whose bytes appear in the opposite order
// (first byte becomes last, and so on). Works on the object representation,
// so it is meant for plain scalar types.
template<typename T>
static T byteswap(T value) {
    T reversed;

    const char * src = reinterpret_cast<const char *>(&value);
    // start one past the last byte of the destination and walk backwards
    char * dst = reinterpret_cast<char *>(&reversed) + sizeof(T);

    for (size_t n = 0; n < sizeof(T); ++n) {
        *--dst = src[n];
    }

    return reversed;
}
|
|
52
54
|
|
|
53
55
|
template<typename T>
|
|
@@ -83,14 +85,14 @@ static void byteswap_tensor(wsp_ggml_tensor * tensor) {
|
|
|
83
85
|
}
|
|
84
86
|
|
|
85
87
|
#define BYTESWAP_VALUE(d) d = byteswap(d)
|
|
86
|
-
#define BYTESWAP_FILTERS(f)
|
|
88
|
+
#define BYTESWAP_FILTERS(f) \
|
|
87
89
|
do { \
|
|
88
90
|
for (auto & datum : f.data) { \
|
|
89
91
|
datum = byteswap(datum); \
|
|
90
92
|
} \
|
|
91
93
|
} while (0)
|
|
92
|
-
#define BYTESWAP_TENSOR(t)
|
|
93
|
-
do {
|
|
94
|
+
#define BYTESWAP_TENSOR(t) \
|
|
95
|
+
do { \
|
|
94
96
|
byteswap_tensor(t); \
|
|
95
97
|
} while (0)
|
|
96
98
|
#else
|
|
@@ -147,21 +149,25 @@ static void whisper_log_callback_default(wsp_ggml_log_level level, const char *
|
|
|
147
149
|
|
|
148
150
|
static bool wsp_ggml_graph_compute_helper(
|
|
149
151
|
struct wsp_ggml_cgraph * graph,
|
|
150
|
-
std::vector<uint8_t> & buf,
|
|
151
152
|
int n_threads,
|
|
152
153
|
wsp_ggml_abort_callback abort_callback,
|
|
153
154
|
void * abort_callback_data) {
|
|
154
|
-
struct wsp_ggml_cplan plan = wsp_ggml_graph_plan(graph, n_threads, nullptr);
|
|
155
155
|
|
|
156
|
-
|
|
157
|
-
plan.abort_callback_data = abort_callback_data;
|
|
156
|
+
wsp_ggml_backend_ptr backend { wsp_ggml_backend_init_by_type(WSP_GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
|
|
158
157
|
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
158
|
+
auto * reg = wsp_ggml_backend_dev_backend_reg(wsp_ggml_backend_get_device(backend.get()));
|
|
159
|
+
|
|
160
|
+
auto * set_abort_callback_fn = (wsp_ggml_backend_set_abort_callback_t) wsp_ggml_backend_reg_get_proc_address(reg, "wsp_ggml_backend_set_abort_callback");
|
|
161
|
+
if (set_abort_callback_fn) {
|
|
162
|
+
set_abort_callback_fn(backend.get(), abort_callback, abort_callback_data);
|
|
162
163
|
}
|
|
163
164
|
|
|
164
|
-
|
|
165
|
+
auto wsp_ggml_backend_set_n_threads_fn = (wsp_ggml_backend_set_n_threads_t) wsp_ggml_backend_reg_get_proc_address(reg, "wsp_ggml_backend_set_n_threads");
|
|
166
|
+
if (wsp_ggml_backend_set_n_threads_fn) {
|
|
167
|
+
wsp_ggml_backend_set_n_threads_fn(backend.get(), n_threads);
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
return wsp_ggml_backend_graph_compute(backend.get(), graph) == WSP_GGML_STATUS_SUCCESS;
|
|
165
171
|
}
|
|
166
172
|
|
|
167
173
|
static bool wsp_ggml_graph_compute_helper(
|
|
@@ -185,6 +191,61 @@ static bool wsp_ggml_graph_compute_helper(
|
|
|
185
191
|
return t;
|
|
186
192
|
}
|
|
187
193
|
|
|
194
|
+
// Load all backends through wsp_ggml_backend_load_all(), at most once per
// process (guarded by std::call_once). Compiles to a no-op unless
// WSP_GGML_BACKEND_DL is defined.
static void whisper_load_backends() {
#ifdef WSP_GGML_BACKEND_DL
    static std::once_flag backends_loaded;
    std::call_once(backends_loaded, wsp_ggml_backend_load_all);
#endif
}
|
|
202
|
+
|
|
203
|
+
// TODO: move these functions to ggml-base with support for ggml-backend?
|
|
204
|
+
|
|
205
|
+
// Overwrite every element of `t` with `v` and return `t`.
// Aborts via WSP_GGML_ASSERT unless `t` is an F32 tensor and contiguous;
// the data is written directly through t->data.
static wsp_ggml_tensor * whisper_set_f32(struct wsp_ggml_tensor * t, float v) {
    WSP_GGML_ASSERT(t->type == WSP_GGML_TYPE_F32);
    WSP_GGML_ASSERT(wsp_ggml_is_contiguous(t));

    // keep the element count in the same signed type as the loop index so the
    // loop condition does not mix signed and unsigned operands
    const int64_t nels = static_cast<int64_t>(wsp_ggml_nelements(t));

    float * dst = (float *) t->data;
    for (int64_t i = 0; i < nels; ++i) {
        dst[i] = v;
    }

    return t;
}
|
|
214
|
+
|
|
215
|
+
// Overwrite every element of `t` with `v` and return `t`.
// Aborts via WSP_GGML_ASSERT unless `t` is an I32 tensor and contiguous;
// the data is written directly through t->data.
static wsp_ggml_tensor * whisper_set_i32(struct wsp_ggml_tensor * t, int32_t v) {
    WSP_GGML_ASSERT(t->type == WSP_GGML_TYPE_I32);
    WSP_GGML_ASSERT(wsp_ggml_is_contiguous(t));

    // keep the element count in the same signed type as the loop index so the
    // loop condition does not mix signed and unsigned operands
    const int64_t nels = static_cast<int64_t>(wsp_ggml_nelements(t));

    int32_t * dst = (int32_t *) t->data;
    for (int64_t i = 0; i < nels; ++i) {
        dst[i] = v;
    }

    return t;
}
|
|
224
|
+
|
|
225
|
+
// Read the F32 element at index (i0, i1, i2, i3) of `t`.
// The element address is t->data plus each index multiplied by the matching
// t->nb[] entry. Aborts via WSP_GGML_ASSERT if `t` is not F32; the indices
// themselves are not range-checked.
static float whisper_get_f32_nd(const struct wsp_ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
    WSP_GGML_ASSERT(t->type == WSP_GGML_TYPE_F32);

    const char * base = reinterpret_cast<const char *>(t->data);
    const char * elem = base + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];

    return *reinterpret_cast<const float *>(elem);
}
|
|
230
|
+
|
|
231
|
+
// Store `v` into the F32 element at index (i0, i1, i2, i3) of `t`.
// The element address is t->data plus each index multiplied by the matching
// t->nb[] entry. Aborts via WSP_GGML_ASSERT if `t` is not F32; the indices
// themselves are not range-checked.
static void whisper_set_f32_nd(struct wsp_ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float v) {
    WSP_GGML_ASSERT(t->type == WSP_GGML_TYPE_F32);

    char * base = reinterpret_cast<char *>(t->data);
    char * elem = base + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];

    *reinterpret_cast<float *>(elem) = v;
}
|
|
236
|
+
|
|
237
|
+
// Read the I32 element at index (i0, i1, i2, i3) of `t`.
// The element address is t->data plus each index multiplied by the matching
// t->nb[] entry. Aborts via WSP_GGML_ASSERT if `t` is not I32; the indices
// themselves are not range-checked.
static int32_t whisper_get_i32_nd(const struct wsp_ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
    WSP_GGML_ASSERT(t->type == WSP_GGML_TYPE_I32);

    const char * base = reinterpret_cast<const char *>(t->data);
    const char * elem = base + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];

    return *reinterpret_cast<const int32_t *>(elem);
}
|
|
242
|
+
|
|
243
|
+
// Store `v` into the I32 element at index (i0, i1, i2, i3) of `t`.
// The element address is t->data plus each index multiplied by the matching
// t->nb[] entry. Aborts via WSP_GGML_ASSERT if `t` is not I32; the indices
// themselves are not range-checked.
static void whisper_set_i32_nd(struct wsp_ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, int32_t v) {
    WSP_GGML_ASSERT(t->type == WSP_GGML_TYPE_I32);

    char * base = reinterpret_cast<char *>(t->data);
    char * elem = base + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];

    *reinterpret_cast<int32_t *>(elem) = v;
}
|
|
248
|
+
|
|
188
249
|
// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
|
|
189
250
|
// the idea is to represent the original matrix multiplication:
|
|
190
251
|
//
|
|
@@ -428,6 +489,7 @@ struct whisper_segment {
|
|
|
428
489
|
int64_t t1;
|
|
429
490
|
|
|
430
491
|
std::string text;
|
|
492
|
+
float no_speech_prob;
|
|
431
493
|
|
|
432
494
|
std::vector<whisper_token_data> tokens;
|
|
433
495
|
|
|
@@ -867,6 +929,7 @@ struct whisper_state {
|
|
|
867
929
|
whisper_token tid_last;
|
|
868
930
|
|
|
869
931
|
std::vector<float> energy; // PCM signal energy
|
|
932
|
+
float no_speech_prob = 0.0f;
|
|
870
933
|
|
|
871
934
|
// [EXPERIMENTAL] Token-level timestamps with DTW
|
|
872
935
|
whisper_aheads_masks aheads_masks;
|
|
@@ -1233,21 +1296,38 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) {
|
|
|
1233
1296
|
// Pick a GPU device and initialize a backend on it.
//
// Only devices reporting WSP_GGML_BACKEND_DEVICE_TYPE_GPU are counted. The first
// one enumerated is taken as a fallback and is replaced by GPU number
// params.gpu_device if the enumeration reaches it. Nothing is selected when
// params.use_gpu is false.
//
// Returns nullptr when no device was selected, or whatever
// wsp_ggml_backend_dev_init() returns for the selected device (a failure is logged).
static wsp_ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
    wsp_ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);

    whisper_load_backends();

    wsp_ggml_backend_dev_t selected = nullptr;

    if (params.use_gpu) {
        int n_gpu_seen = 0;

        for (size_t idx = 0; idx < wsp_ggml_backend_dev_count(); ++idx) {
            wsp_ggml_backend_dev_t candidate = wsp_ggml_backend_dev_get(idx);
            if (wsp_ggml_backend_dev_type(candidate) != WSP_GGML_BACKEND_DEVICE_TYPE_GPU) {
                continue;
            }

            const bool is_first     = n_gpu_seen == 0;
            const bool is_requested = n_gpu_seen == params.gpu_device;
            if (is_first || is_requested) {
                selected = candidate;
            }

            // stop once the requested GPU index has been passed
            ++n_gpu_seen;
            if (n_gpu_seen > params.gpu_device) {
                break;
            }
        }
    }

    if (selected == nullptr) {
        WHISPER_LOG_INFO("%s: no GPU found\n", __func__);
        return nullptr;
    }

    WHISPER_LOG_INFO("%s: using %s backend\n", __func__, wsp_ggml_backend_dev_name(selected));

    wsp_ggml_backend_t backend = wsp_ggml_backend_dev_init(selected, nullptr);
    if (backend == nullptr) {
        WHISPER_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, wsp_ggml_backend_dev_name(selected));
    }

    return backend;
}
|
|
1252
1332
|
|
|
1253
1333
|
static std::vector<wsp_ggml_backend_t> whisper_backend_init(const whisper_context_params & params) {
|
|
@@ -1275,26 +1355,33 @@ static std::vector<wsp_ggml_backend_t> whisper_backend_init(const whisper_contex
|
|
|
1275
1355
|
|
|
1276
1356
|
WSP_GGML_UNUSED(params);
|
|
1277
1357
|
|
|
1278
|
-
result.push_back(
|
|
1358
|
+
result.push_back(wsp_ggml_backend_init_by_type(WSP_GGML_BACKEND_DEVICE_TYPE_CPU, nullptr));
|
|
1279
1359
|
|
|
1280
1360
|
return result;
|
|
1281
1361
|
}
|
|
1282
1362
|
|
|
1283
1363
|
// Choose the default buffer type for the given context parameters.
//
// Starts from the CPU buffer type. When params.use_gpu is set, GPU-type devices
// are counted in enumeration order: the first one supplies the buffer type, and
// GPU number params.gpu_device overrides it if the enumeration reaches it.
static wsp_ggml_backend_buffer_type_t whisper_default_buffer_type(const whisper_context_params & params) {
    wsp_ggml_backend_buffer_type_t buft = wsp_ggml_backend_cpu_buffer_type();

    if (params.use_gpu) {
        int n_gpu_seen = 0;

        for (size_t idx = 0; idx < wsp_ggml_backend_dev_count(); ++idx) {
            wsp_ggml_backend_dev_t candidate = wsp_ggml_backend_dev_get(idx);
            if (wsp_ggml_backend_dev_type(candidate) != WSP_GGML_BACKEND_DEVICE_TYPE_GPU) {
                continue;
            }

            if (n_gpu_seen == 0 || n_gpu_seen == params.gpu_device) {
                buft = wsp_ggml_backend_dev_buffer_type(candidate);
            }

            // stop once the requested GPU index has been passed
            ++n_gpu_seen;
            if (n_gpu_seen > params.gpu_device) {
                break;
            }
        }
    }

    return buft;
}
|
|
1299
1386
|
|
|
1300
1387
|
// load the model from a ggml file
|
|
@@ -4184,22 +4271,28 @@ static int whisper_has_openvino(void) {
|
|
|
4184
4271
|
const char * whisper_print_system_info(void) {
|
|
4185
4272
|
static std::string s;
|
|
4186
4273
|
|
|
4274
|
+
whisper_load_backends();
|
|
4275
|
+
|
|
4187
4276
|
s = "";
|
|
4188
|
-
s += "
|
|
4189
|
-
s += "AVX2 = " + std::to_string(wsp_ggml_cpu_has_avx2()) + " | ";
|
|
4190
|
-
s += "AVX512 = " + std::to_string(wsp_ggml_cpu_has_avx512()) + " | ";
|
|
4191
|
-
s += "FMA = " + std::to_string(wsp_ggml_cpu_has_fma()) + " | ";
|
|
4192
|
-
s += "NEON = " + std::to_string(wsp_ggml_cpu_has_neon()) + " | ";
|
|
4193
|
-
s += "ARM_FMA = " + std::to_string(wsp_ggml_cpu_has_arm_fma()) + " | ";
|
|
4194
|
-
s += "F16C = " + std::to_string(wsp_ggml_cpu_has_f16c()) + " | ";
|
|
4195
|
-
s += "FP16_VA = " + std::to_string(wsp_ggml_cpu_has_fp16_va()) + " | ";
|
|
4196
|
-
s += "WASM_SIMD = " + std::to_string(wsp_ggml_cpu_has_wasm_simd()) + " | ";
|
|
4197
|
-
s += "SSE3 = " + std::to_string(wsp_ggml_cpu_has_sse3()) + " | ";
|
|
4198
|
-
s += "SSSE3 = " + std::to_string(wsp_ggml_cpu_has_ssse3()) + " | ";
|
|
4199
|
-
s += "VSX = " + std::to_string(wsp_ggml_cpu_has_vsx()) + " | ";
|
|
4277
|
+
s += "WHISPER : ";
|
|
4200
4278
|
s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
|
|
4201
4279
|
s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
|
|
4202
4280
|
|
|
4281
|
+
for (size_t i = 0; i < wsp_ggml_backend_reg_count(); i++) {
|
|
4282
|
+
auto * reg = wsp_ggml_backend_reg_get(i);
|
|
4283
|
+
auto * get_features_fn = (wsp_ggml_backend_get_features_t) wsp_ggml_backend_reg_get_proc_address(reg, "wsp_ggml_backend_get_features");
|
|
4284
|
+
if (get_features_fn) {
|
|
4285
|
+
wsp_ggml_backend_feature * features = get_features_fn(reg);
|
|
4286
|
+
s += wsp_ggml_backend_reg_name(reg);
|
|
4287
|
+
s += " : ";
|
|
4288
|
+
for (; features->name; features++) {
|
|
4289
|
+
s += features->name;
|
|
4290
|
+
s += " = ";
|
|
4291
|
+
s += features->value;
|
|
4292
|
+
s += " | ";
|
|
4293
|
+
}
|
|
4294
|
+
}
|
|
4295
|
+
}
|
|
4203
4296
|
return s.c_str();
|
|
4204
4297
|
}
|
|
4205
4298
|
|
|
@@ -4679,7 +4772,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
|
|
4679
4772
|
/*.detect_language =*/ false,
|
|
4680
4773
|
|
|
4681
4774
|
/*.suppress_blank =*/ true,
|
|
4682
|
-
/*.
|
|
4775
|
+
/*.suppress_nst =*/ false,
|
|
4683
4776
|
|
|
4684
4777
|
/*.temperature =*/ 0.0f,
|
|
4685
4778
|
/*.max_initial_ts =*/ 1.0f,
|
|
@@ -4829,6 +4922,42 @@ static const std::vector<std::string> non_speech_tokens = {
|
|
|
4829
4922
|
"♪♪♪","♩", "♪", "♫", "♬", "♭", "♮", "♯"
|
|
4830
4923
|
};
|
|
4831
4924
|
|
|
4925
|
+
// Compute the log-softmax of the first `n_logits` entries of `logits`.
//
//   logits   - raw logits; may hold more than n_logits entries, only the first
//              n_logits are read
//   n_logits - number of entries to process; nothing is done when it is <= 0
//   logprobs - output, must already hold at least n_logits entries
//
// Entries equal to -INFINITY are left out of the normalisation and map to
// -INFINITY in the output.
static void whisper_compute_logprobs(
                const std::vector<float> & logits,
                              const int    n_logits,
                      std::vector<float> & logprobs) {
    if (n_logits <= 0) {
        // nothing to normalise; also avoids dereferencing end() of an empty vector
        return;
    }

    // The shift must come from the same range that is summed below: a larger value
    // located past n_logits would make every expf() term underflow to zero.
    const float logit_max = *std::max_element(logits.begin(), logits.begin() + n_logits);

    float logsumexp = 0.0f;
    for (int i = 0; i < n_logits; ++i) {
        if (logits[i] > -INFINITY) {
            logsumexp += expf(logits[i] - logit_max);
        }
    }
    logsumexp = logf(logsumexp) + logit_max;

    for (int i = 0; i < n_logits; ++i) {
        if (logits[i] > -INFINITY) {
            logprobs[i] = logits[i] - logsumexp;
        } else {
            logprobs[i] = -INFINITY;
        }
    }
}
|
|
4946
|
+
|
|
4947
|
+
// Convert log-probabilities into probabilities for the first `n_logits` entries.
//
//   logits   - raw logits, used only to detect entries equal to -INFINITY
//   n_logits - number of entries to process
//   logprobs - log-probabilities matching `logits`
//   probs    - output, must already hold at least n_logits entries
//
// An entry whose logit is -INFINITY gets probability 0; every other entry gets
// expf(logprobs[i]).
static void whisper_compute_probs(
                const std::vector<float> & logits,
                              const int    n_logits,
                const std::vector<float> & logprobs,
                      std::vector<float> & probs) {
    for (int idx = 0; idx < n_logits; ++idx) {
        const bool masked = logits[idx] == -INFINITY;
        probs[idx] = masked ? 0.0f : expf(logprobs[idx]);
    }
}
|
|
4960
|
+
|
|
4832
4961
|
// process the logits for the selected decoder
|
|
4833
4962
|
// - applies logit filters
|
|
4834
4963
|
// - computes logprobs and probs
|
|
@@ -4890,7 +5019,7 @@ static void whisper_process_logits(
|
|
|
4890
5019
|
|
|
4891
5020
|
// suppress sot and nosp tokens
|
|
4892
5021
|
logits[vocab.token_sot] = -INFINITY;
|
|
4893
|
-
logits[vocab.token_nosp] = -INFINITY;
|
|
5022
|
+
logits[vocab.token_nosp] = -INFINITY;
|
|
4894
5023
|
|
|
4895
5024
|
// [TDRZ] when tinydiarize is disabled, suppress solm token
|
|
4896
5025
|
if (params.tdrz_enable == false) {
|
|
@@ -4927,7 +5056,7 @@ static void whisper_process_logits(
|
|
|
4927
5056
|
|
|
4928
5057
|
// suppress non-speech tokens
|
|
4929
5058
|
// ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
|
|
4930
|
-
if (params.
|
|
5059
|
+
if (params.suppress_nst) {
|
|
4931
5060
|
for (const std::string & token : non_speech_tokens) {
|
|
4932
5061
|
const std::string suppress_tokens[] = {token, " " + token};
|
|
4933
5062
|
for (const std::string & suppress_token : suppress_tokens) {
|
|
@@ -4989,24 +5118,7 @@ static void whisper_process_logits(
|
|
|
4989
5118
|
}
|
|
4990
5119
|
|
|
4991
5120
|
// populate the logprobs array (log_softmax)
|
|
4992
|
-
|
|
4993
|
-
const float logit_max = *std::max_element(logits.begin(), logits.end());
|
|
4994
|
-
float logsumexp = 0.0f;
|
|
4995
|
-
for (int i = 0; i < n_logits; ++i) {
|
|
4996
|
-
if (logits[i] > -INFINITY) {
|
|
4997
|
-
logsumexp += expf(logits[i] - logit_max);
|
|
4998
|
-
}
|
|
4999
|
-
}
|
|
5000
|
-
logsumexp = logf(logsumexp) + logit_max;
|
|
5001
|
-
|
|
5002
|
-
for (int i = 0; i < n_logits; ++i) {
|
|
5003
|
-
if (logits[i] > -INFINITY) {
|
|
5004
|
-
logprobs[i] = logits[i] - logsumexp;
|
|
5005
|
-
} else {
|
|
5006
|
-
logprobs[i] = -INFINITY;
|
|
5007
|
-
}
|
|
5008
|
-
}
|
|
5009
|
-
}
|
|
5121
|
+
whisper_compute_logprobs(logits, n_logits, logprobs);
|
|
5010
5122
|
|
|
5011
5123
|
// if sum of probability over timestamps is above any other token, sample timestamp
|
|
5012
5124
|
// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L431-L437
|
|
@@ -5064,15 +5176,7 @@ static void whisper_process_logits(
|
|
|
5064
5176
|
}
|
|
5065
5177
|
|
|
5066
5178
|
// compute probs
|
|
5067
|
-
|
|
5068
|
-
for (int i = 0; i < n_logits; ++i) {
|
|
5069
|
-
if (logits[i] == -INFINITY) {
|
|
5070
|
-
probs[i] = 0.0f;
|
|
5071
|
-
} else {
|
|
5072
|
-
probs[i] = expf(logprobs[i]);
|
|
5073
|
-
}
|
|
5074
|
-
}
|
|
5075
|
-
}
|
|
5179
|
+
whisper_compute_probs(logits, n_logits, logprobs, probs);
|
|
5076
5180
|
|
|
5077
5181
|
#if 0
|
|
5078
5182
|
// print first 100 logits - token string : logit
|
|
@@ -5651,6 +5755,18 @@ int whisper_full_with_state(
|
|
|
5651
5755
|
return -8;
|
|
5652
5756
|
}
|
|
5653
5757
|
|
|
5758
|
+
// Calculate no_speech probability after first decode.
|
|
5759
|
+
// This has to be done before any logit filtering. Hence we cannot use the probs from the whisper_process_logits.
|
|
5760
|
+
{
|
|
5761
|
+
const int n_logits = ctx->vocab.id_to_token.size();
|
|
5762
|
+
std::vector<float> logprobs(n_logits);
|
|
5763
|
+
std::vector<float> probs(n_logits);
|
|
5764
|
+
|
|
5765
|
+
whisper_compute_logprobs(state->logits, n_logits, logprobs);
|
|
5766
|
+
whisper_compute_probs(state->logits, n_logits, logprobs, probs);
|
|
5767
|
+
state->no_speech_prob = probs[whisper_token_nosp(ctx)];
|
|
5768
|
+
}
|
|
5769
|
+
|
|
5654
5770
|
{
|
|
5655
5771
|
const int64_t t_start_sample_us = wsp_ggml_time_us();
|
|
5656
5772
|
|
|
@@ -6042,8 +6158,9 @@ int whisper_full_with_state(
|
|
|
6042
6158
|
if (it != (int) temperatures.size() - 1) {
|
|
6043
6159
|
const auto & decoder = state->decoders[best_decoder_id];
|
|
6044
6160
|
|
|
6045
|
-
if (decoder.failed ||
|
|
6046
|
-
|
|
6161
|
+
if (decoder.failed ||
|
|
6162
|
+
(decoder.sequence.avg_logprobs < params.logprob_thold && state->no_speech_prob < params.no_speech_thold)) {
|
|
6163
|
+
WHISPER_LOG_DEBUG("%s: failed due to avg_logprobs %8.5f < %8.5f and no_speech_prob %8.5f < %8.5f\n", __func__, decoder.sequence.avg_logprobs, params.logprob_thold, state->no_speech_prob, params.no_speech_thold);
|
|
6047
6164
|
success = false;
|
|
6048
6165
|
state->n_fail_p++;
|
|
6049
6166
|
}
|
|
@@ -6064,7 +6181,7 @@ int whisper_full_with_state(
|
|
|
6064
6181
|
{
|
|
6065
6182
|
const auto & best_decoder = state->decoders[best_decoder_id];
|
|
6066
6183
|
|
|
6067
|
-
|
|
6184
|
+
auto seek_delta = best_decoder.seek_delta;
|
|
6068
6185
|
const auto result_len = best_decoder.sequence.result_len;
|
|
6069
6186
|
|
|
6070
6187
|
const auto & tokens_cur = best_decoder.sequence.tokens;
|
|
@@ -6072,6 +6189,9 @@ int whisper_full_with_state(
|
|
|
6072
6189
|
// [EXPERIMENTAL] Token-level timestamps with DTW
|
|
6073
6190
|
const auto n_segments_before = state->result_all.size();
|
|
6074
6191
|
|
|
6192
|
+
const bool is_no_speech = (state->no_speech_prob > params.no_speech_thold &&
|
|
6193
|
+
best_decoder.sequence.avg_logprobs < params.logprob_thold);
|
|
6194
|
+
|
|
6075
6195
|
//WHISPER_LOG_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta);
|
|
6076
6196
|
|
|
6077
6197
|
// update prompt_past
|
|
@@ -6080,11 +6200,11 @@ int whisper_full_with_state(
|
|
|
6080
6200
|
prompt_past.insert(prompt_past.end(), prompt.begin() + 1, prompt.end() - prompt_init.size());
|
|
6081
6201
|
}
|
|
6082
6202
|
|
|
6083
|
-
for (int i = 0; i < result_len; ++i) {
|
|
6203
|
+
for (int i = 0; i < result_len && !is_no_speech; ++i) {
|
|
6084
6204
|
prompt_past.push_back(tokens_cur[i].id);
|
|
6085
6205
|
}
|
|
6086
6206
|
|
|
6087
|
-
if (!tokens_cur.empty() && ctx->model.n_loaded > 0) {
|
|
6207
|
+
if (!tokens_cur.empty() && ctx->model.n_loaded > 0 && !is_no_speech) {
|
|
6088
6208
|
int i0 = 0;
|
|
6089
6209
|
auto t0 = seek + 2*(tokens_cur.front().tid - whisper_token_beg(ctx));
|
|
6090
6210
|
|
|
@@ -6123,7 +6243,7 @@ int whisper_full_with_state(
|
|
|
6123
6243
|
|
|
6124
6244
|
//printf("tt0 = %d, tt1 = %d, text = %s, token = %s, token_id = %d, tid = %d\n", tt0, tt1, text.c_str(), ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].id, tokens_cur[i].tid);
|
|
6125
6245
|
|
|
6126
|
-
result_all.push_back({ tt0, tt1, text, {}, speaker_turn_next });
|
|
6246
|
+
result_all.push_back({ tt0, tt1, text, state->no_speech_prob, {}, speaker_turn_next });
|
|
6127
6247
|
for (int j = i0; j <= i; j++) {
|
|
6128
6248
|
result_all.back().tokens.push_back(tokens_cur[j]);
|
|
6129
6249
|
}
|
|
@@ -6168,7 +6288,7 @@ int whisper_full_with_state(
|
|
|
6168
6288
|
}
|
|
6169
6289
|
}
|
|
6170
6290
|
|
|
6171
|
-
result_all.push_back({ tt0, tt1, text, {}
|
|
6291
|
+
result_all.push_back({ tt0, tt1, text, state->no_speech_prob, {}, speaker_turn_next });
|
|
6172
6292
|
for (int j = i0; j < (int) tokens_cur.size(); j++) {
|
|
6173
6293
|
result_all.back().tokens.push_back(tokens_cur[j]);
|
|
6174
6294
|
}
|
|
@@ -6205,6 +6325,15 @@ int whisper_full_with_state(
|
|
|
6205
6325
|
}
|
|
6206
6326
|
}
|
|
6207
6327
|
|
|
6328
|
+
// ref: https://github.com/ggerganov/whisper.cpp/pull/2629
|
|
6329
|
+
const bool single_timestamp_ending = tokens_cur.size() > 1 &&
|
|
6330
|
+
tokens_cur[tokens_cur.size() - 2].id < whisper_token_beg(ctx) &&
|
|
6331
|
+
tokens_cur[tokens_cur.size() - 1].id > whisper_token_beg(ctx);
|
|
6332
|
+
if (single_timestamp_ending) {
|
|
6333
|
+
WHISPER_LOG_DEBUG("single timestamp ending - skip entire chunk\n");
|
|
6334
|
+
seek_delta = std::min(seek_end - seek, WHISPER_CHUNK_SIZE * 100);
|
|
6335
|
+
}
|
|
6336
|
+
|
|
6208
6337
|
// update audio window
|
|
6209
6338
|
seek += seek_delta;
|
|
6210
6339
|
|
|
@@ -6426,6 +6555,14 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
|
|
|
6426
6555
|
return ctx->state->result_all[i_segment].tokens[i_token].p;
|
|
6427
6556
|
}
|
|
6428
6557
|
|
|
6558
|
+
float whisper_full_get_segment_no_speech_prob(struct whisper_context * ctx, int i_segment) {
|
|
6559
|
+
return ctx->state->result_all[i_segment].no_speech_prob;
|
|
6560
|
+
}
|
|
6561
|
+
|
|
6562
|
+
float whisper_full_get_segment_no_speech_prob_from_state(struct whisper_state * state, int i_segment) {
|
|
6563
|
+
return state->result_all[i_segment].no_speech_prob;
|
|
6564
|
+
}
|
|
6565
|
+
|
|
6429
6566
|
// =================================================================================================
|
|
6430
6567
|
|
|
6431
6568
|
//
|
|
@@ -6587,6 +6724,8 @@ WHISPER_API int whisper_bench_wsp_ggml_mul_mat(int n_threads) {
|
|
|
6587
6724
|
}
|
|
6588
6725
|
|
|
6589
6726
|
WHISPER_API const char * whisper_bench_wsp_ggml_mul_mat_str(int n_threads) {
|
|
6727
|
+
whisper_load_backends();
|
|
6728
|
+
|
|
6590
6729
|
static std::string s;
|
|
6591
6730
|
s = "";
|
|
6592
6731
|
char strbuf[256];
|
|
@@ -6606,7 +6745,6 @@ WHISPER_API const char * whisper_bench_wsp_ggml_mul_mat_str(int n_threads) {
|
|
|
6606
6745
|
// c: N*N*sizeof(float)
|
|
6607
6746
|
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
|
|
6608
6747
|
std::vector<uint8_t> buf(3llu*N_max*N_max*sizeof(float) + 3*wsp_ggml_tensor_overhead() + wsp_ggml_graph_overhead());
|
|
6609
|
-
std::vector<uint8_t> work;
|
|
6610
6748
|
|
|
6611
6749
|
// put a bunch of random data in the buffer
|
|
6612
6750
|
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
|
|
@@ -6663,12 +6801,12 @@ WHISPER_API const char * whisper_bench_wsp_ggml_mul_mat_str(int n_threads) {
|
|
|
6663
6801
|
double tsum = 0.0;
|
|
6664
6802
|
|
|
6665
6803
|
// heat-up
|
|
6666
|
-
wsp_ggml_graph_compute_helper(gf,
|
|
6804
|
+
wsp_ggml_graph_compute_helper(gf, n_threads, nullptr, nullptr);
|
|
6667
6805
|
|
|
6668
6806
|
for (int i = 0; i < n_max; ++i) {
|
|
6669
6807
|
const int64_t t0 = wsp_ggml_time_us();
|
|
6670
6808
|
|
|
6671
|
-
wsp_ggml_graph_compute_helper(gf,
|
|
6809
|
+
wsp_ggml_graph_compute_helper(gf, n_threads, nullptr, nullptr);
|
|
6672
6810
|
|
|
6673
6811
|
const int64_t t1 = wsp_ggml_time_us();
|
|
6674
6812
|
|
|
@@ -7045,18 +7183,18 @@ static wsp_ggml_tensor * dtw_and_backtrace(wsp_ggml_context * ctx, wsp_ggml_tens
|
|
|
7045
7183
|
struct wsp_ggml_tensor * cost = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, N + 1, M + 1);
|
|
7046
7184
|
struct wsp_ggml_tensor * trace = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_I32, N + 1, M + 1);
|
|
7047
7185
|
|
|
7048
|
-
cost =
|
|
7049
|
-
trace =
|
|
7050
|
-
|
|
7186
|
+
cost = whisper_set_f32(cost, INFINITY);
|
|
7187
|
+
trace = whisper_set_i32(trace, -1);
|
|
7188
|
+
whisper_set_f32_nd(cost, 0, 0, 0, 0, 0.0);
|
|
7051
7189
|
|
|
7052
7190
|
// dtw
|
|
7053
7191
|
// supposedly can be optimized by computing diagonals in parallel ?
|
|
7054
7192
|
// Not sure it is worth it since x will be GENERATED_TOKENS*1500 size at most.
|
|
7055
7193
|
for (int64_t j = 1; j < M + 1; ++j) {
|
|
7056
7194
|
for (int64_t i = 1; i < N + 1; ++i) {
|
|
7057
|
-
float c0 =
|
|
7058
|
-
float c1 =
|
|
7059
|
-
float c2 =
|
|
7195
|
+
float c0 = whisper_get_f32_nd(cost, i - 1, j - 1, 0, 0);
|
|
7196
|
+
float c1 = whisper_get_f32_nd(cost, i - 1, j, 0, 0);
|
|
7197
|
+
float c2 = whisper_get_f32_nd(cost, i, j - 1, 0, 0);
|
|
7060
7198
|
|
|
7061
7199
|
float c;
|
|
7062
7200
|
int32_t t;
|
|
@@ -7071,9 +7209,9 @@ static wsp_ggml_tensor * dtw_and_backtrace(wsp_ggml_context * ctx, wsp_ggml_tens
|
|
|
7071
7209
|
t = 2;
|
|
7072
7210
|
}
|
|
7073
7211
|
|
|
7074
|
-
c =
|
|
7075
|
-
|
|
7076
|
-
|
|
7212
|
+
c = whisper_get_f32_nd(x, i - 1, j - 1, 0, 0) + c;
|
|
7213
|
+
whisper_set_f32_nd(cost, i, j, 0, 0, c);
|
|
7214
|
+
whisper_set_i32_nd(trace, i, j, 0, 0, t);
|
|
7077
7215
|
}
|
|
7078
7216
|
}
|
|
7079
7217
|
|
|
@@ -7082,19 +7220,19 @@ static wsp_ggml_tensor * dtw_and_backtrace(wsp_ggml_context * ctx, wsp_ggml_tens
|
|
|
7082
7220
|
struct wsp_ggml_tensor * bt = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_I32, BT_MAX_ROWS, 2);
|
|
7083
7221
|
// trace[0, :] = 2;
|
|
7084
7222
|
for (int64_t i = 0; i < M + 1; ++i)
|
|
7085
|
-
|
|
7223
|
+
whisper_set_i32_nd(trace, 0, i, 0, 0, 2);
|
|
7086
7224
|
//trace[:, 0] = 1;
|
|
7087
7225
|
for (int64_t i = 0; i < N + 1; ++i)
|
|
7088
|
-
|
|
7226
|
+
whisper_set_i32_nd(trace, i, 0, 0, 0, 1);
|
|
7089
7227
|
int bt_row_idx = BT_MAX_ROWS - 1;
|
|
7090
7228
|
int64_t i = N;
|
|
7091
7229
|
int64_t j = M;
|
|
7092
7230
|
while (i > 0 || j > 0) {
|
|
7093
|
-
|
|
7094
|
-
|
|
7231
|
+
whisper_set_i32_nd(bt, bt_row_idx, 0, 0, 0, i - 1);
|
|
7232
|
+
whisper_set_i32_nd(bt, bt_row_idx, 1, 0, 0, j - 1);
|
|
7095
7233
|
--bt_row_idx;
|
|
7096
7234
|
|
|
7097
|
-
int32_t t =
|
|
7235
|
+
int32_t t = whisper_get_i32_nd(trace, i, j, 0, 0);
|
|
7098
7236
|
if (t == 0) {
|
|
7099
7237
|
--i;
|
|
7100
7238
|
--j;
|
|
@@ -7115,8 +7253,8 @@ static wsp_ggml_tensor * dtw_and_backtrace(wsp_ggml_context * ctx, wsp_ggml_tens
|
|
|
7115
7253
|
wsp_ggml_tensor * r = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_I32, 2, result_n_cols);
|
|
7116
7254
|
for (int64_t i = 0; i < 2; ++i) {
|
|
7117
7255
|
for (int64_t j = 0; j < result_n_cols; ++j) {
|
|
7118
|
-
int32_t v =
|
|
7119
|
-
|
|
7256
|
+
int32_t v = whisper_get_i32_nd(bt, j+bt_row_idx+1, i, 0, 0);
|
|
7257
|
+
whisper_set_i32_nd(r, i, j, 0, 0, v);
|
|
7120
7258
|
}
|
|
7121
7259
|
}
|
|
7122
7260
|
|
|
@@ -7151,11 +7289,11 @@ static void median_filter(struct wsp_ggml_tensor * dst , const struct wsp_ggml_t
|
|
|
7151
7289
|
idx = 2*(a->ne[2] - 1) - idx;
|
|
7152
7290
|
}
|
|
7153
7291
|
|
|
7154
|
-
filter.push_back(
|
|
7292
|
+
filter.push_back(whisper_get_f32_nd(a, i, j, idx, 0));
|
|
7155
7293
|
}
|
|
7156
7294
|
std::sort(filter.begin(), filter.end());
|
|
7157
7295
|
const float v = filter[filter.size()/2];
|
|
7158
|
-
|
|
7296
|
+
whisper_set_f32_nd(dst, i, j, k, 0, v);
|
|
7159
7297
|
filter.clear();
|
|
7160
7298
|
}
|
|
7161
7299
|
}
|
|
@@ -7277,7 +7415,9 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
|
|
|
7277
7415
|
// Compute
|
|
7278
7416
|
struct wsp_ggml_cgraph * gf = wsp_ggml_new_graph(gctx);
|
|
7279
7417
|
wsp_ggml_build_forward_expand(gf, w);
|
|
7280
|
-
|
|
7418
|
+
|
|
7419
|
+
wsp_ggml_backend_ptr backend { wsp_ggml_backend_init_by_type(WSP_GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
|
|
7420
|
+
wsp_ggml_backend_graph_compute(backend.get(), gf);
|
|
7281
7421
|
|
|
7282
7422
|
wsp_ggml_tensor * alignment = dtw_and_backtrace(gctx, w);
|
|
7283
7423
|
|
|
@@ -7286,9 +7426,9 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
|
|
|
7286
7426
|
auto seg_i = state->result_all.begin() + i_segment;
|
|
7287
7427
|
auto tok_i = seg_i->tokens.begin();
|
|
7288
7428
|
for (int i = 0; i < alignment->ne[1]; ++i) {
|
|
7289
|
-
int32_t v =
|
|
7429
|
+
int32_t v = whisper_get_i32_nd(alignment, 0, i, 0, 0);
|
|
7290
7430
|
if (v != last_v) {
|
|
7291
|
-
int32_t time_index =
|
|
7431
|
+
int32_t time_index = whisper_get_i32_nd(alignment, 1, i, 0, 0);
|
|
7292
7432
|
int64_t timestamp = (time_index * 2) + seek; // Each index on DTW result = 20mS audio
|
|
7293
7433
|
last_v = v;
|
|
7294
7434
|
|